Calculus#

[1]:
import math
import re
import timeit
import warnings
from collections import ChainMap
from itertools import product, combinations

import cvxopt
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import sympy as sp
from IPython.core.getipython import get_ipython
from IPython.display import HTML, Math
from matplotlib.animation import FuncAnimation
from scipy.interpolate import interp1d
from shapely.geometry import LineString, Polygon

plt.style.use("seaborn-v0_8-whitegrid")
pio.renderers.default = "plotly_mimetype+notebook"

Derivatives (week 1)#

Average rate of change of a function#

[2]:
def constant(x, c, *args):
    return np.full_like(x, c)


def linear(x, m, *args):
    return x * m


def quadratic(x, *args):
    return np.power(x, 2)


def cubic(x, *args):
    return np.power(x, 3)


def fractional(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x == 0] = np.nan
    elif x == 0:
        x = np.nan
    return np.power(x, -1)


def squareroot(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x < 0] = np.nan
    elif x < 0:
        x = np.nan
    return np.power(x, 1 / 2)


def exponential(x, *args):
    return np.exp(x)


def logarithmic(x, b, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x <= 0] = np.nan
    elif x <= 0:
        x = np.nan
    return np.log(x) / np.log(b)


def sin(x, *args):
    return np.sin(x)


def cos(x, *args):
    return np.cos(x)


def abs(x, *args):
    return np.abs(x)


def sigmoid(x, *args):
    return 1 / (1 + np.exp(-x))


def relu(x, *args):
    # np.maximum: element-wise max, not np.max along axis
    return np.maximum(0, x)


def tanh(x, *args):
    return np.tanh(x)


x = np.linspace(-6, 6, 400)
funcs = [
    constant,
    linear,
    quadratic,
    cubic,
    fractional,
    squareroot,
    exponential,
    logarithmic,
    logarithmic,
    sin,
    cos,
    abs,
    sigmoid,
    relu,
    tanh,
]
args = [
    [0],
    [2],
    [None],
    [None],
    [None],
    [None],
    [None],
    [np.e],
    [10],
    [None],
    [None],
    [None],
    [None],
    [None],
    [None],
]

fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
    ax.scatter(x, f(x, *arg), s=5)
    ax.set_title(
        f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
        loc="right",
    )
    ax.spines[["left", "bottom"]].set_position(("data", 0))
    ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
    ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle("Common functions", fontsize=15)
plt.show()
../_images/notebooks_calculus_4_0.png

Let \(x = 0.25\) and \(\Delta x = 1.25\). We calculate the average rate of change of each function wrt \(x\) over the interval \(\Delta x\) as

📐 \(\cfrac{\Delta f}{\Delta x} = \cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)
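
For instance, a quick numerical check of this formula for the quadratic function at \(x = 0.25\) with \(\Delta x = 1.25\) (a minimal sketch reusing the functions defined above):

[ ]:
x_1 = 0.25
d_x = 1.25
# (f(x + Δx) - f(x)) / Δx for f(x) = x²: (1.5² - 0.25²) / 1.25 = 1.75
print((quadratic(x_1 + d_x) - quadratic(x_1)) / d_x)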

[3]:
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
    ax.scatter(x, f(x, *arg), s=5)
    x_1 = 0.25
    d_x = 1.25
    ax.scatter(x_1, f(x_1, *arg), s=20, c="tab:orange")
    ax.scatter(x_1 + d_x, f(x_1 + d_x, *arg), s=20, c="tab:orange")
    ax.plot([x_1, x_1 + d_x], [f(x_1, *arg), f(x_1, *arg)], "--", color="tab:orange")
    ax.plot(
        [x_1 + d_x, x_1 + d_x],
        [f(x_1 + d_x, *arg), f(x_1, *arg)],
        "--",
        color="tab:orange",
    )
    ax.plot(
        [x_1, x_1 + d_x],
        [f(x_1, *arg), f(x_1 + d_x, *arg)],
        "--",
        linewidth=2,
        color="tab:green",
    )
    slope = (f(x_1 + d_x, *arg) - f(x_1, *arg)) / d_x
    ax.text(
        x_1 + d_x,
        f(x_1 + d_x, *arg),
        f"{slope:.2f}",
        fontsize=12,
        color="k",
        bbox=dict(facecolor="tab:green", alpha=0.8),
    )
    ax.set_title(
        f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
        loc="right",
    )
    ax.spines[["left", "bottom"]].set_position(("data", 0))
    ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
    ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle(
    "Average rate of change of common functions at $x=0.25$ over $\\Delta x = 1.25$",
    fontsize=15,
)
plt.show()
../_images/notebooks_calculus_6_0.png

As \(\Delta x\) approaches 0 we get the instantaneous rate of change of each function wrt x.

📐 \(\lim_{{\Delta x}\to{0}}\cfrac{\Delta f}{\Delta x} = \lim_{{\Delta x}\to{0}}\cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)

[4]:
fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, arg in zip(axs.flatten(), funcs, args):
    ax.scatter(x, f(x, *arg), s=5)
    x_1 = 0.25
    h = 1e-6
    # y-y1 = m(x-x1)
    # y = m(x - x1) + y1
    slope = (f(x_1 + h, *arg) - f(x_1, *arg)) / h
    tan_range = np.linspace(-1, 1)
    ax.plot(
        tan_range,
        slope * (tan_range - x_1) + f(x_1, *arg),
        "--",
        color="tab:green",
    )
    ax.text(
        x_1,
        f(x_1, *arg),
        f"{slope:.2f}",
        fontsize=12,
        color="k",
        bbox=dict(facecolor="tab:green", alpha=0.8),
    )
    ax.set_title(
        f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
        loc="right",
    )
    ax.spines[["left", "bottom"]].set_position(("data", 0))
    ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
    ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle(
    "Instanteneous rate of change of common functions at $x=0.25$", fontsize=15
)
plt.show()
../_images/notebooks_calculus_8_0.png

Derivative of a function#

🔑 The derivative of the function \(f(x)\) wrt \(x\) is \(\cfrac{d}{dx}f(x) = \lim_{{\Delta x}\to{0}}\cfrac{\Delta f}{\Delta x} = \lim_{{\Delta x}\to{0}}\cfrac{f(x + \Delta x) - f(x)}{\Delta x}\)
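
As a quick numerical sanity check of this definition, we can approximate the limit with a small \(\Delta x\) and compare it to the analytical derivative of \(x^2\), which we derive below (a minimal sketch):

[ ]:
h = 1e-6
x_0 = 3.0
# finite-difference approximation of d/dx x² at x = 3 vs the analytical value 2x = 6
print((quadratic(x_0 + h) - quadratic(x_0)) / h, 2 * x_0)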

The derivatives of the common functions seen so far are:

\(\cfrac{d}{dx}(k) = \lim_{{\Delta x}\to{0}} \cfrac{k - k}{\Delta x} = 0\)

\(\cfrac{d}{dx}(2x) = \lim_{{\Delta x}\to{0}} \cfrac{2(x+\Delta x) - 2x}{\Delta x} = 2\)

\(\cfrac{d}{dx}(x^2) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^2 - x^2}{\Delta x} = 2x\)

\(\cfrac{d}{dx}(x^3) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^3 - x^3}{\Delta x} = 3x^2\)

\(\cfrac{d}{dx}\left( \cfrac{1}{x} \right) = \cfrac{d}{dx}(x^{-1}) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^{-1} - x^{-1}}{\Delta x} = -x^{-2} \text{ for } x \ne 0\)

\(\cfrac{d}{dx}(\sqrt{x}) = \cfrac{d}{dx}(x^{\frac{1}{2}}) = \lim_{{\Delta x}\to{0}} \cfrac{(x+\Delta x)^{\frac{1}{2}} - x^{\frac{1}{2}}}{\Delta x} = \cfrac{1}{2}x^{-\frac{1}{2}} \text{ for } x \ge 0\)

\(\cfrac{d}{dx}(e^x) = \lim_{{\Delta x}\to{0}} \cfrac{e^{x+\Delta x} - e^x}{\Delta x} = e^x\)

\(\cfrac{d}{dx}(\log_e(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\ln(x+\Delta x) - \ln(x)}{\Delta x} = \cfrac{1}{x\ln(e)} \text{ for } x > 0\)

\(\cfrac{d}{dx}(\log_{10}(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\log_{10}(x+\Delta x) - \log_{10}(x)}{\Delta x} = \cfrac{1}{x \ln(10)} \text{ for } x > 0\)

\(\cfrac{d}{dx}(\sin(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\sin(x+\Delta x) - \sin(x)}{\Delta x} = \cos(x)\)

\(\cfrac{d}{dx}(\cos(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\cos(x+\Delta x) - \cos(x)}{\Delta x} = -\sin(x)\)

\(\cfrac{d}{dx}(|x|) = \lim_{{\Delta x}\to{0}} \cfrac{|x+\Delta x| - |x|}{\Delta x} = \begin{cases}1 \text{ if } x > 0\\-1 \text{ if } x < 0\\\text{undefined if } x = 0\end{cases}\)

\(\cfrac{d}{dx}(\sigma(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\sigma(x+\Delta x) - \sigma(x)}{\Delta x} = \sigma(x)(1-\sigma(x))\)

\(\cfrac{d}{dx}(\text{ReLU}(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\text{ReLU}(x+\Delta x) - \text{ReLU}(x)}{\Delta x} = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x < 0\\\text{undefined if } x = 0\end{cases}\)

\(\cfrac{d}{dx}(\tanh(x)) = \lim_{{\Delta x}\to{0}} \cfrac{\tanh(x+\Delta x) - \tanh(x)}{\Delta x} = 1 - \text{tanh}(x)^2\)

[5]:
def d_constant(x, *args):
    return np.full_like(x, 0)


def d_linear(x, m, *args):
    return np.full_like(x, m)


def d_quadratic(x, *args):
    return 2 * x


def d_cubic(x, *args):
    return 3 * (x**2)


def d_fractional(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x == 0] = np.nan
    elif x == 0:
        x = np.nan
    return -(x**-2)


def d_squareroot(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x < 0] = np.nan
    elif x < 0:
        x = np.nan
    return (1 / 2) * x ** (-1 / 2)


def d_exponential(x, *args):
    return np.exp(x)


def d_logarithmic(x, b, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x <= 0] = np.nan
    elif x <= 0:
        x = np.nan
    return 1 / (x * np.log(b))


def d_sin(x, *args):
    return np.cos(x)


def d_cos(x, *args):
    return -np.sin(x)


def d_abs(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x == 0] = np.nan
    elif x == 0:
        x = np.nan
    return np.sign(x)


def d_sigmoid(x, *args):
    return sigmoid(x) * (1 - sigmoid(x))


def d_relu(x, *args):
    if isinstance(x, np.ndarray):
        x = x.copy()
        x[x == 0] = np.nan
    elif x == 0:
        x = np.nan
    return np.sign(relu(x))


def d_tanh(x, *args):
    return 1 - np.tanh(x) ** 2


d_funcs = [
    d_constant,
    d_linear,
    d_quadratic,
    d_cubic,
    d_fractional,
    d_squareroot,
    d_exponential,
    d_logarithmic,
    d_logarithmic,
    d_sin,
    d_cos,
    d_abs,
    d_sigmoid,
    d_relu,
    d_tanh,
]

fig, axs = plt.subplots(nrows=3, ncols=5, figsize=(5 * 4, 3 * 4))
for ax, f, df, arg in zip(axs.flatten(), funcs, d_funcs, args):
    ax.scatter(x, f(x, *arg), s=5)
    f_lims = np.array(ax.get_ylim())
    ax.scatter(x, df(x, *arg), s=5)
    df_lims = np.array(ax.get_ylim())
    if any(abs(df_lims) > 10 * abs(f_lims)):
        ax.set_ylim(f_lims)
    ax.set_title(
        f.__name__ + " " + " ".join([str(a)[:3] for a in arg if a is not None]),
        loc="right",
    )
    ax.spines[["left", "bottom"]].set_position(("data", 0))
    ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
    ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.suptitle("Derivatives of common functions", fontsize=15)
plt.show()
../_images/notebooks_calculus_11_0.png

Non-differentiable functions#

🔑 A function is not differentiable at \(x = a\) if \(\lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\) does not exist

Visually, we can tell that a function is not differentiable at a point if we see

  1. a cusp or a corner

  2. a jump or point of discontinuity

  3. a vertical tangent

Proof that |x| is not differentiable at 0#

\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)

\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)

\(\lim_{x\to{0}}\cfrac{|x|}{x}\)

\(\lim_{x\to{0}^+}\cfrac{x}{x} = 1\)

\(\lim_{x\to{0}^-}\cfrac{-x}{x} = -1\)

\(\lim_{x\to{0}}\cfrac{|x|}{x}\) does not exist because \(\lim_{x\to{0}^+} \ne \lim_{x\to{0}^-}\)

Proof that ReLU is not differentiable at 0#

\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)

\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)

\(\lim_{x\to{0}}\cfrac{\text{ReLU}(x)}{x}\)

\(\lim_{x\to{0}^+}\cfrac{x}{x} = 1\)

\(\lim_{x\to{0}^-}\cfrac{0}{x} = 0\)

\(\lim_{x\to{0}}\cfrac{\text{ReLU}(x)}{x}\) does not exist because \(\lim_{x\to{0}^+} \ne \lim_{x\to{0}^-}\)

That being said, the common convention in machine learning applications is to assign a derivative of 0 at the non-differentiable point 0, such that

\(\text{ReLU}'(x) = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x \le 0\end{cases}\)

This won’t make a lot of sense now, because we haven’t introduced backward propagation yet, but it’s a bit imprecise to say that “conventionally the derivative of ReLU at 0 is 0”.

Frameworks like Tensorflow, for example, just do not propagate the gradient when the ReLU activation is 0.

Source: https://github.com/tensorflow/tensorflow/blob/0aecf379b7bbdbe93be91643825f0ae94171d509/tensorflow/core/kernels/relu_op_functor.h#L52
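
As a small sketch of that convention (ours, not the TensorFlow kernel linked above), the ML-style ReLU "derivative" simply maps \(x \le 0\) to 0:

[ ]:
def d_relu_ml(x, *args):
    # machine learning convention: use 0 as the (sub)derivative at x = 0
    return np.where(x > 0, 1.0, 0.0)


print(d_relu_ml(np.array([-1.0, 0.0, 2.0])))  # [0. 0. 1.]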

Proof that radicals are not differentiable at 0#

\(\cfrac{df}{dx}(a) = \lim_{x\to{a}}\cfrac{f(x)-f(a)}{x-a}\)

\(\lim_{x\to{0}}\cfrac{f(x)-f(0)}{x-0}\)

\(\lim_{x\to{0}^+}\cfrac{\sqrt{x}}{x}\)

\(\lim_{x\to{0}^+}\cfrac{\sqrt{x}\sqrt{x}}{x\sqrt{x}}\)

\(\lim_{x\to{0}^+}\cfrac{x}{x\sqrt{x}}\)

\(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}}\)

\(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}}\) does not exist because \(\lim_{x\to{0}^+}\cfrac{1}{\sqrt{x}} \rightarrow +\infty\)

The inverse function and its derivative#

\(\sqrt{x}\) is the inverse of \(x^2\), because \(x \xrightarrow{x^2} x^2 \xrightarrow{\sqrt{x}} x\)

🔑 \(g(y)\) is the inverse of \(f(x)\) if \(x \xrightarrow{f(x)} y \xrightarrow{g(y)} x\). In other words the inverse function undoes what the function does.

\(\cfrac{1}{x}\) is NOT the inverse of \(x\), because \(x \xrightarrow{x} x \xrightarrow{\frac{1}{x}} \cfrac{1}{x}\)

\(\ln x\) is the inverse of \(e^x\), because \(x \xrightarrow{\ln x} \ln x \xrightarrow{e^x} x\)

🔑 The derivative of the inverse function is \(g'(y) = \cfrac{1}{f'(x)}\), where \(y = f(x)\)

Let’s verify it.

\(\cfrac{d}{dy}\ln{y} = \cfrac{1}{y}\) but also \(\cfrac{1}{\cfrac{d}{dx}e^{x}\Big|_{x=\ln y}} = \cfrac{1}{e^{\ln y}} = \cfrac{1}{y}\)

\(\cfrac{d}{dy}\sqrt{y} = \cfrac{d}{dy}y^{\frac{1}{2}} = \cfrac{1}{2}y^{-\frac{1}{2}}\) but also \(\cfrac{1}{\cfrac{d}{dx}x^2\Big|_{x=\sqrt{y}}} = \cfrac{1}{2\sqrt{y}} = \cfrac{1}{2}y^{-\frac{1}{2}}\)

[6]:
# exact float equality can fail due to rounding; compare with np.isclose instead
if np.isclose(squareroot(quadratic(x_1)), x_1):
    assert np.isclose(1 / d_quadratic(x_1), d_squareroot(quadratic(x_1)))

if np.isclose(np.exp(np.log(x_1)), x_1):
    assert np.isclose(1 / d_exponential(x_1), d_logarithmic(exponential(x_1), b=np.e))

Derivative rules#

Constant multiple rule#

📐 \(\cfrac{d}{dx}[cf(x)] = cf'(x)\)

Example:

\(y = 5x^2\)

\(\cfrac{dy}{dx} = 5 \cfrac{d}{dx}x^2\)

\(\cfrac{dy}{dx} = 10x\)

[7]:
x = sp.symbols("x")
expr = 5 * x**2
expr.diff()
[7]:
$\displaystyle 10 x$

Sum or difference rule#

📐 \(\cfrac{d}{dx}[f(x) + g(x)] = f'(x) + g'(x)\)

Example:

\(y = 5x^2 + 3x + 1\)

\(\cfrac{dy}{dx} = \cfrac{d}{dx}(5x^2) + \cfrac{d}{dx}(3x) + \cfrac{d}{dx}(1)\)

\(\cfrac{dy}{dx} = 10x + 3\)

[8]:
x = sp.symbols("x")
expr = 5 * x**2 + 3 * x + 1
expr.diff()
[8]:
$\displaystyle 10 x + 3$

Product rule#

📐 \(\cfrac{d}{dx}[f(x)g(x)] = f'(x)g(x) + f(x)g'(x)\)

Example:

\(y = (5x^2)(3x + 1)\)

\(\cfrac{dy}{dx} = \cfrac{d}{dx}(5x^2)(3x + 1) + \cfrac{d}{dx}(3x + 1)(5x^2)\)

\(\cfrac{dy}{dx} = 5\left(\cfrac{d}{dx}(x^2)(3x + 1) + \cfrac{d}{dx}(3x + 1)(x^2)\right)\)

\(\cfrac{dy}{dx} = 5(2x(3x+1) + 3(x^2))\)

\(\cfrac{dy}{dx} = 5(6x^2+2x+3x^2)\)

\(\cfrac{dy}{dx} = 5(9x^2+2x)\)

[9]:
x = sp.symbols("x")
expr = (5 * x**2) * (3 * x + 1)
expr.diff().simplify()
[9]:
$\displaystyle 5 x \left(9 x + 2\right)$

Quotient rule#

📐 \(\cfrac{d}{dx}\left[\cfrac{f(x)}{g(x)}\right] = \cfrac{f'(x)g(x) - f(x)g'(x)}{g(x)^2}\)

Example:

\(y = \cfrac{5x^2}{3x + 1}\)

\(\cfrac{dy}{dx} = \cfrac{\cfrac{d}{dx}(5x^2)(3x + 1)-\cfrac{d}{dx}(3x + 1)(5x^2)}{(3x + 1)^2}\)

\(\cfrac{dy}{dx} = 5\left(\cfrac{\cfrac{d}{dx}(x^2)(3x + 1)-\cfrac{d}{dx}(3x + 1)(x^2)}{(3x + 1)^2}\right)\)

\(\cfrac{dy}{dx} = 5\left(\cfrac{2x(3x+1) - 3(x^2)}{(3x + 1)^2}\right)\)

\(\cfrac{dy}{dx} = 5\left(\cfrac{6x^2+2x-3x^2}{(3x + 1)^2}\right)\)

\(\cfrac{dy}{dx} = \cfrac{5(3x^2+2x)}{(3x + 1)^2}\)

[10]:
x = sp.symbols("x")
expr = (5 * x**2) / (3 * x + 1)
expr.diff().simplify().factor()
[10]:
$\displaystyle \frac{5 x \left(3 x + 2\right)}{\left(3 x + 1\right)^{2}}$

Chain rule#

📐 \(\cfrac{d}{dx}[g(f(x))] = g'(f(x))f'(x)\)

Example I:

\(y = 5(3x + 1)^2\)

\(\cfrac{dy}{dx} = 5\left(\cfrac{d}{d(3x+1)}(3x+1)^2 \cdot \cfrac{d}{dx}(3x+1)\right)\)

\(\cfrac{dy}{dx} = 5(2(3x+1)3)\)

\(\cfrac{dy}{dx} = 30(3x+1)\)

[11]:
x = sp.symbols("x")
expr = 5 * (3 * x + 1) ** 2
expr.diff().simplify().factor()
[11]:
$\displaystyle 30 \cdot \left(3 x + 1\right)$

Example II:

\(y = e^{3x+1}\)

\(\cfrac{dy}{dx} = \cfrac{d}{d(3x+1)}e^{3x+1} \cdot \cfrac{d}{dx}(3x+1)\)

\(\cfrac{dy}{dx} = 3e^{3x+1}\)

[12]:
x = sp.symbols("x")
expr = sp.exp(3 * x + 1)
expr.diff()
[12]:
$\displaystyle 3 e^{3 x + 1}$

Univariate optimization (week 1)#

The probability of getting \(k\) successes in \(n\) independent Bernoulli trials with success probability \(p\) for each trial is:

\(Pr(k, n, p) = \binom{n}{k}p^k(1-p)^{n-k}\)

To motivate the maximization of \(Pr(k, n, p)\) wrt \(p\), let’s imagine we’re playing a game where we need to toss a coin 10 times and we win only if we get 7 heads and 3 tails.

We can use a biased coin for this game and we can customize the bias \(p\).
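
To get a feel for the stakes, here is a quick evaluation of the win probability for a fair coin versus a coin biased towards heads (a small sketch using math.comb):

[ ]:
def binom_pmf(k, n, p):
    return math.comb(n, k) * p**k * (1 - p) ** (n - k)


print(f"fair coin (p=0.5):   {binom_pmf(7, 10, 0.5):.4f}")  # ≈ 0.1172
print(f"biased coin (p=0.7): {binom_pmf(7, 10, 0.7):.4f}")  # ≈ 0.2668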

To maximize \(Pr(k, n, p)\) wrt \(p\) we can omit the binomial coefficient, since it does not depend on \(p\).

\(p^k(1-p)^{n-k}\)

[13]:
p, k, n = sp.symbols("p, k, n")
pr = p**k * (1 - p) ** (n - k)
pr
[13]:
$\displaystyle p^{k} \left(1 - p\right)^{- k + n}$

Let’s find the derivative wrt \(p\).

[14]:
dprdp = sp.diff(pr, p)
dprdp.simplify().factor()
[14]:
$\displaystyle - \frac{k p^{k} \left(1 - p\right)^{- k + n + 1} + k p^{k + 1} \left(1 - p\right)^{- k + n} - n p^{k + 1} \left(1 - p\right)^{- k + n}}{p \left(p - 1\right)}$

Let’s evaluate it for \(k=7\) and \(n=10\) and visualize both \(Pr(7, 10, p)\) and \(\cfrac{d}{dp}Pr(7, 10, p)\)

[15]:
def move_spplot_to_axes(p, ax):
    backend = p.backend(p)
    backend.ax = ax
    backend._process_series(backend.parent._series, ax, backend.parent)
    backend.ax.spines["right"].set_color("none")
    backend.ax.spines["bottom"].set_position("zero")
    backend.ax.spines["top"].set_color("none")
    plt.close(backend.fig)


p1 = sp.plot(
    dprdp.evalf(subs={k: 7, n: 10}),
    pr.evalf(subs={k: 7, n: 10}),
    (p, 0, 1),
    legend=True,
    ylabel="$Pr(7, 10, p)$",
    title="Function and derivative evaluated for $k=7$ and $n=10$",
    show=False,
)

fig, ax = plt.subplots()
move_spplot_to_axes(p1, ax)
plt.axhline(
    y=pr.evalf(subs={k: 7, n: 10, p: 0.7}),
    xmin=0.5,
    xmax=0.9,
    color="tab:orange",
    linestyle="--",
    linewidth=1,
)
plt.scatter(0.7, 0, zorder=3)
plt.xticks(np.linspace(0, 1, 11))
plt.show()
../_images/notebooks_calculus_47_0.png

When \(k = 7\) and \(n = 10\), the maximum of \(Pr(7, 10, p)\) is attained at \(p=0.7\), that is, where \(\cfrac{d}{dp}Pr(7, 10, p) = 0\)

Let’s prove it analytically.

[16]:
sp.solve(dprdp, p)
[16]:
[k/n]

Generally, \(Pr(k, n, p)\) is maximized for \(p=k/n\).

Let’s evaluate it for \(k = 7\) and \(n = 10\).

[17]:
sp.solve(dprdp, p)[0].evalf(subs={k: 7, n: 10})
[17]:
$\displaystyle 0.7$

It turns out there is an easier way to maximize \(p^k(1-p)^{n-k}\).

And it’s based on the fact that \(\arg\max_{p}\, p^k(1-p)^{n-k} = \arg\max_{p}\, \ln\left(p^k(1-p)^{n-k}\right)\), since the logarithm is monotonically increasing.

Using the product rule \(\ln (ab) = \ln a + \ln b\) and the power rule \(\ln a^b = b \ln a\) we get

\(\ln p^k(1-p)^{n-k} = k \ln p + (n-k) \ln (1-p)\)

Calculating the derivative of \(k \ln p + (n-k) \ln (1-p)\) is a lot easier

\(\cfrac{d}{dp}\left[k \ln p + (n-k) \ln (1-p)\right]\)

\(\cfrac{d}{dp}k \ln p + \cfrac{d}{dp}(n-k) \ln (1-p)\)

\(k \cfrac{d}{dp} \ln p + (n-k)\cfrac{d}{dp} \ln (1-p)\)

\(k \cfrac{1}{p} + (n-k)\cfrac{1}{1-p}(-1)\)

\(k \cfrac{1}{p} - (n-k)\cfrac{1}{1-p}\)

Now let’s solve it.

\(k \cfrac{1}{p} - (n-k)\cfrac{1}{1-p} = 0\)

\(\cfrac{k(1-p)-(n-k)p}{p(1-p)} = 0\)

\(k(1-p)-(n-k)p = 0\)

\(k - kp - np + kp = 0\)

\(k - np = 0\)

\(p = \cfrac{k}{n}\)
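
We can let SymPy confirm that the log form leads to the same solution (a minimal sketch reusing the p, k and n symbols defined above):

[ ]:
log_pr = k * sp.log(p) + (n - k) * sp.log(1 - p)
sp.solve(sp.diff(log_pr, p), p)  # [k/n]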

Computational efficiency of symbolic, numerical and automatic differentiation (week 1)#

Symbolic differentiation produces exact derivatives like when computing derivatives by hand. It’s fast for simple functions, but it slows down as the complexity increases.

Numerical differentiation produces an approximation that is somewhat similar to computing the instantaneous rate of change for a very small \(\Delta x\). It’s slow.

Automatic differentiation computes derivatives by constructing a computational graph of basic functions and applying the chain rule at each node; the result is exact up to floating-point precision rather than an approximation. It’s fast even for complex functions. It’s the most common approach used in neural networks.

[18]:
def get_setup(k, v):
    sym_setup = f"""
import sympy as sp
import numpy as np
x_arr = np.linspace(-6, 6, 1000000)
def f(x):
    return {v}
def diff():
    x = sp.symbols("x")
    return sp.lambdify(x, sp.diff(f(x), x), "numpy")(x_arr)
"""

    num_setup = f"""
import numpy as np
x_arr = np.linspace(-6, 6, 1000000)
def f(x):
    return {v}
def diff():
    return np.gradient(f(x_arr), x_arr)
"""

    aut_setup = f"""
from jax import vmap, grad
import jax.numpy as jnp
x_arr = jnp.linspace(-6, 6, 1000000)
def f(x):
    return {v}
def diff():
    return vmap(grad(f))(x_arr)
"""

    setup = {
        "symbolic": sym_setup,
        "numerical": num_setup,
        "automatic": aut_setup,
    }

    return setup[k]


res = {
    "symbolic": {
        "x**2": 0,
        "2*x**3 - 3*x**2 + 5": 0,
        "sp.exp(-2*x) + 3*sp.sin(3*x)": 0,
    },
    "numerical": {
        "x**2": 0,
        "2*x**3 - 3*x**2 + 5": 0,
        "np.exp(-2*x) + 3*np.sin(3*x)": 0,
    },
    "automatic": {
        "x**2": 0,
        "2*x**3 - 3*x**2 + 5": 0,
        "jnp.exp(-2*x) + 3*jnp.sin(3*x)": 0,
    },
}
for k, vv in res.items():
    for v in vv.keys():
        r = timeit.repeat("diff()", number=100, repeat=3, setup=get_setup(k, v))
        res[k][v] = np.array(r).mean()

x_lab = [r"$x^2$", r"$2x^3-3x^2+5$", r"$e^{-2x}+3\sin(3x)$"]
plt.plot(x_lab, res["symbolic"].values(), marker="o", linestyle="--", label="symbolic")
plt.plot(
    x_lab, res["numerical"].values(), marker="o", linestyle="--", label="numerical"
)
plt.plot(
    x_lab, res["automatic"].values(), marker="o", linestyle="--", label="automatic"
)
plt.xlabel("function")
plt.ylabel("seconds")
plt.title(
    "Time taken for evaluating 100 differentiations using an array of shape 1,000,000"
)
plt.legend()
plt.show()
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
I0000 00:00:1698064049.344074    2017 tfrt_cpu_pjrt_client.cc:349] TfrtCpuClient created.
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
../_images/notebooks_calculus_57_1.png

Multivariate optimization (week 2)#

Tangent plane#

The equation of the tangent line to \(f(x)\) at point \(x=a\) is

📐 \(y = \cfrac{d}{dx}f(a)(x-a) + f(a)\)

This is derived from the point-slope form of a line

\(y-y_1 = m(x-x_1)\)

The equation of the tangent plane to \(f(x, y)\) at point \(x=a\) and \(y=b\) is

📐 \(z = \cfrac{\partial}{\partial x}f(a, b)(x-a) + \cfrac{\partial}{\partial y}f(a, b)(y-b) + f(a, b)\)

which, similarly, is derived from the point-slope form of a plane

\(z-z_1 = m_1(x-x_1) + m_2(y-y_1)\)
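
As a small worked example (a sketch of ours), the tangent plane to \(f(x, y) = x^2 + y^2\) at \(x=2\), \(y=4\), computed directly from this formula:

[ ]:
x, y = sp.symbols("x, y")
f_xy = x**2 + y**2
a, b = 2, 4
plane = (
    sp.diff(f_xy, x).subs({x: a, y: b}) * (x - a)
    + sp.diff(f_xy, y).subs({x: a, y: b}) * (y - b)
    + f_xy.subs({x: a, y: b})
)
print(sp.expand(plane))  # 4*x + 8*y - 20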

[19]:
def tangent_line(dfa, a, sym_a, a_range, b, sym_b, f):
    # evaluate both the partial derivative and the function at the point (a, b)
    return (
        dfa.evalf(subs={sym_a: a, sym_b: b}) * (a_range - a)
        + f.evalf(subs={sym_a: a, sym_b: b})
    ).astype("float32")


def tangent_plane(dfa, dfb, a, a_range, b, b_range, f):
    return (
        dfa.evalf(subs={x: a, y: b}) * (a_range - a)
        + dfb.evalf(subs={x: a, y: b}) * (b_range - b)
        + f.evalf(subs={x: a, y: b})
    ).astype("float32")


x, y = sp.symbols("x, y")
parabloid = x**2 + y**2
dfx = sp.diff(parabloid, x)
dfy = sp.diff(parabloid, y)
x0 = 2
y0 = 4
full_range = np.linspace(-8, 8, 100)
y_cut_xx, y_cut_yy = np.meshgrid(full_range, np.linspace(-8, y0, 100))
xcut_xx, xcut_yy = np.meshgrid(np.linspace(-8, x0, 100), full_range)
full_xx, full_yy = np.meshgrid(full_range, full_range)
tan_x = np.linspace(x0 - 4, x0 + 4, 100)
tan_y = np.linspace(y0 - 4, y0 + 4, 100)
tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)
const_x = np.full(100, x0)
const_y = np.full(100, y0)

ycut_parabloid_surface = go.Surface(
    z=sp.lambdify((x, y), parabloid, "numpy")(y_cut_xx, y_cut_yy),
    x=y_cut_xx,
    y=y_cut_yy,
    colorscale="Blues",
    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
    colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
    showlegend=True,
    legendgrouptitle_text="Partial derivative wrt x",
    legendgroup="x",
    name="y-cut parabloid",
)
xcut_parabloid_surface = go.Surface(
    z=sp.lambdify((x, y), parabloid, "numpy")(xcut_xx, xcut_yy),
    x=xcut_xx,
    y=xcut_yy,
    colorscale="Blues",
    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
    colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
    showlegend=True,
    legendgrouptitle_text="Partial derivative wrt y",
    legendgroup="y",
    name="x-cut parabloid",
)
full_parabloid_surface = go.Surface(
    z=sp.lambdify((x, y), parabloid, "numpy")(full_xx, full_yy),
    x=full_xx,
    y=full_yy,
    colorscale="Blues",
    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
    colorbar=dict(orientation="h", y=-0.2, title=dict(text="z", side="top")),
    showlegend=True,
    name="full parabloid",
)
poi = go.Scatter3d(
    x=[x0],
    y=[y0],
    z=[sp.lambdify((x, y), parabloid, "numpy")(x0, y0)],
    marker=dict(color="#000000"),
    showlegend=True,
    name="x=2 y=4",
)
yparabola = go.Scatter3d(
    x=const_x,
    y=full_range,
    z=sp.lambdify((x, y), parabloid, "numpy")(const_x, full_range),
    mode="lines",
    line=dict(color="#000000", width=5),
    showlegend=True,
    legendgroup="y",
    name="y parabola",
)
ytangent = go.Scatter3d(
    x=const_x,
    y=tan_y,
    z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=parabloid),
    mode="lines",
    line=dict(color="#000000"),
    showlegend=True,
    legendgroup="y",
    name="y tangent",
)
xparabola = go.Scatter3d(
    x=full_range,
    y=const_y,
    z=sp.lambdify((x, y), parabloid, "numpy")(full_range, const_y),
    mode="lines",
    line=dict(color="#000000", width=5),
    showlegend=True,
    legendgroup="x",
    name="x parabola",
)
xtangent = go.Scatter3d(
    x=tan_x,
    y=const_y,
    z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=parabloid),
    mode="lines",
    line=dict(color="#000000"),
    showlegend=True,
    legendgroup="x",
    name="x tangent",
)
tangent_surface = go.Surface(
    z=tangent_plane(
        dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=parabloid
    ),
    x=tan_xx,
    y=tan_yy,
    colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
    showscale=False,
    name="tangent plane",
    showlegend=True,
)
fig = go.Figure(
    data=[
        full_parabloid_surface,
        xcut_parabloid_surface,
        ycut_parabloid_surface,
        poi,
        xtangent,
        xparabola,
        ytangent,
        yparabola,
        tangent_surface,
    ]
)
fig.update_layout(
    title="Tangent plane of parabloid at x=2 and y=4",
    autosize=False,
    width=600,
    height=600,
    margin=dict(l=10, r=10, b=10, t=30),
    legend=dict(groupclick="togglegroup", itemclick="toggleothers"),
    scene_camera=dict(
        eye=dict(x=1.5, y=1.5, z=0.5),
    ),
)
fig.show()

Partial derivatives#

If we look at the tangent plane on the previous plot from a certain angle we’ll see two orthogonal lines, as if they were the axes of the plane.

😉 click on Reset camera to last save on the plot’s navbar to reset the eye to (x=1.5, y=1.5, z=.5)

The two lines that span the tangent plane are the tangent lines at that point, and their respective slopes are called partial derivatives.

To visualize (or at least get a sense of) the partial derivatives:

\(\cfrac{\partial}{\partial x}f(x, y)\) at \(x=2\): select the legend group Partial derivative wrt x.

\(\cfrac{\partial}{\partial y}f(x, y)\) at \(y=4\): select the legend group Partial derivative wrt y.

In either case, we can see that the partial derivative is just the derivative of the imaginary 2D parabola that results from \(f(x, y)\) while keeping one of the two variables constant.

In fact, calculating partial derivatives is a 2-step process:

  1. treat all the other variables as constants

  2. apply the usual differentiation rules

For example, let \(f(x, y) = x^2+y^2\). To calculate \(\cfrac{\partial}{\partial x}x^2+y^2\) we

  1. treat \(y\) as a constant, let’s say 1 (we don’t usually do this substitution in practice; we do it here to drive the point home)

\(f(x, 1) = x^2+1^2\)

  2. apply the usual differentiation rules (in this case, the power, constant multiple and sum rules)

\(\cfrac{\partial f(x,y)}{\partial x} = 2x + 0\)

Let’s do another example. Let \(f(x, y) = x^2y^2\).

We could do the same as before and replace \(y\) with 1, but in this and more complex cases it might create more confusion than it’s worth, because we’d have to substitute \(y\) back in: here it doesn’t go away like it did in the previous example.

Let’s leave \(y\) as is and just treat it as a constant. By the power and constant multiple rules we have

\(\cfrac{\partial f(x,y)}{\partial x} = 2xy^2\)
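
SymPy agrees (a one-line check reusing the x and y symbols):

[ ]:
sp.diff(x**2 * y**2, x)  # 2*x*y**2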

Setting the partial derivatives to 0 to find minima and maxima#

Let’s imagine we are in a sauna 5 meters wide and 5 meters long. We want to find the coldest place in the room.

Conveniently we know the function of the temperature in terms of the room coordinates.

[20]:
x, y = sp.symbols("x, y")
temp = 50 - sp.Rational(1, 50) * x**2 * (x - 6) * y**2 * (y - 6)
temp
[20]:
$\displaystyle - \frac{x^{2} y^{2} \left(x - 6\right) \left(y - 6\right)}{50} + 50$
[21]:
room_size = 5
xx, yy = np.meshgrid(np.linspace(0, room_size, 100), np.linspace(0, room_size, 100))

surface = go.Surface(
    z=sp.lambdify((x, y), temp, "numpy")(xx, yy),
    x=xx,
    y=yy,
    colorscale="RdBu_r",
    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
    colorbar=dict(title="Temperature"),
    name="temperature function",
)

fig = go.Figure(surface)
fig.update_layout(
    title="Function of the sauna temperature",
    autosize=False,
    width=600,
    height=600,
    scene_aspectmode="cube",
    margin=dict(l=10, r=10, b=10, t=30),
    scene_camera=dict(
        eye=dict(x=2.1, y=0.1, z=0.7),
    ),
)
fig.show()

Let’s calculate the partial derivative wrt x.

[22]:
dfx = sp.diff(temp, x).factor()
dfx
[22]:
$\displaystyle - \frac{3 x y^{2} \left(x - 4\right) \left(y - 6\right)}{50}$

Let’s calculate the partial derivative wrt y.

[23]:
dfy = sp.diff(temp, y).factor()
dfy
[23]:
$\displaystyle - \frac{3 x^{2} y \left(x - 6\right) \left(y - 4\right)}{50}$

Let’s check where the partial derivatives are 0.

[24]:
solutions = {"x": set(), "y": set()}
for s in sp.solve(dfx) + sp.solve(dfy):
    for k, v in s.items():
        if v <= room_size:
            solutions[str(k)].add(float(v))
solutions = list(product(solutions["x"], solutions["y"]))

zs = []
for s in solutions:
    z = sp.lambdify((x, y), temp, "numpy")(s[0], s[1])
    zs.append(z)
    fig.add_scatter3d(
        x=[s[0]],
        y=[s[1]],
        z=[z],
        marker=dict(color="#67001F" if z > 40 else "#053061"),
        name="maximum" if z > 40 else "minimum",
    )
fig.update_layout(
    title="Maxima and minima of the function",
    showlegend=False,
    scene_camera=dict(
        eye=dict(x=2.0, y=-1.0, z=0.2),
    ),
)
fig.show()

Let’s show the tangent plane at the minimum point.

[25]:
x0, y0 = solutions[np.argmin(zs)]
tan_x = np.linspace(x0 - 1, x0 + 1, 100)
tan_y = np.linspace(y0 - 1, y0 + 1, 100)
tan_xx, tan_yy = np.meshgrid(tan_x, tan_y)
const_x = np.full(100, x0)
const_y = np.full(100, y0)

ytangent = go.Scatter3d(
    x=const_x,
    y=tan_y,
    z=tangent_line(dfa=dfy, a=y0, sym_a=y, a_range=tan_y, b=x0, sym_b=x, f=temp),
    mode="lines",
    line=dict(color="#000000"),
    showlegend=True,
    legendgroup="y",
    name="y tangent",
)

xtangent = go.Scatter3d(
    x=tan_x,
    y=const_y,
    z=tangent_line(dfa=dfx, a=x0, sym_a=x, a_range=tan_x, b=y0, sym_b=y, f=temp),
    mode="lines",
    line=dict(color="#000000"),
    showlegend=True,
    legendgroup="x",
    name="x tangent",
)

tangent_surface = go.Surface(
    z=tangent_plane(
        dfa=dfx, dfb=dfy, a=x0, a_range=tan_xx, b=y0, b_range=tan_yy, f=temp
    ),
    x=tan_xx,
    y=tan_yy,
    colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
    showscale=False,
    name="tangent plane",
    showlegend=True,
)

fig.add_traces([ytangent, xtangent, tangent_surface])
fig.update_layout(
    title="Tangent plane at the minimum",
)
fig.show()

Gradient#

🔑 The gradient of \(f\) is a vector, each element of which is a partial derivative of \(f\)

\(\nabla f(x_1, x_2, \cdots, x_n) = \langle \cfrac{\partial f}{\partial x_1}, \cfrac{\partial f}{\partial x_2}, \cdots, \cfrac{\partial f}{\partial x_n} \rangle\)
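
For example, the gradient of the paraboloid from the previous section, computed with SymPy (a small sketch):

[ ]:
x, y = sp.symbols("x, y")
sp.Matrix([sp.diff(x**2 + y**2, x), sp.diff(x**2 + y**2, y)])  # Matrix([[2*x], [2*y]])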

Assume we are at \(\vec{p}\) in the domain space of \(f\) within \(\mathbb{R}^3\):

\(\vec{p} = \begin{bmatrix}0.3\\0.2\\0.8\\\end{bmatrix}\)

Upon calculating the partial derivatives of \(f(\vec{p})\), suppose we obtain the gradient:

\(\nabla f(\vec{p}) = \begin{bmatrix}0.05\\-0.2\\-0.1\end{bmatrix}\)

This implies that:

  • The function \(f\) is increasing if we move to the right in the first dimension because the slope is positive

  • The function \(f\) is increasing if we move to the left in the other two dimensions because the slope is negative

Furthermore, the second dimension has the steepest ascent, while the first dimension has the flattest.

🔑 \(\nabla f\) provides the direction and rate of fastest increase of \(f\), while \(-\nabla f\) provides the direction and rate of fastest decrease of \(f\)

[26]:
nabla = sp.lambdify(
    (x, y), sp.Matrix([sp.diff(temp, x), sp.diff(temp, y)]), "numpy"
)
p0 = np.array([1.0, 3.0])
g0 = nabla(*p0)
xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))
cs = plt.contourf(
    xx, yy, sp.lambdify((x, y), temp, "numpy")(xx, yy), levels=20, cmap="RdBu_r"
)
plt.scatter(p0[0], p0[1], color="k")
plt.quiver(
    p0[0],
    p0[1],
    -g0[0],
    -g0[1],
    angles="xy",
    scale_units="xy",
    scale=10,
    color="k",
)
plt.xlabel("x")
plt.ylabel("y")
plt.gca().set_aspect("equal")
plt.title(f"Negative gradient stemming from ${sp.latex(p0)}$")
plt.colorbar(cs, label="z")
plt.show()
../_images/notebooks_calculus_79_0.png

Continuing the previous example in \(\mathbb{R}^3\), if our goal is to find a minimum of the function \(f\), one approach is to update our position by taking a step of size \(S\) in the opposite direction of the gradient:

\(\vec{p}_{next} = \langle 0.3, 0.2, 0.8 \rangle - S \cdot \text{sign}(\langle 0.05, -0.2, -0.1 \rangle)\)

This approach doesn’t use all the information contained in the gradient, just the direction. It’s important to also use the steepness of the slope, so that we can take a larger step when the function is very steep (far from the minimum) and a smaller step when it’s flat (near the minimum).

A better approach involves taking a step proportional to the value of the negative gradient.

\(\vec{p}_{next} = \langle 0.3, 0.2, 0.8 \rangle - \alpha \cdot \langle0.05, -0.2, -0.1 \rangle\)

🔑 \(\alpha\) is called learning rate and controls the step size

The above is the main idea of the gradient descent algorithm.

Below is the pseudocode of a typical implementation of the gradient descent algorithm for \(T\) steps or until convergence, where convergence is defined in terms of the norm of the gradient.

\(\begin{array}{l} \textbf{Algorithm: } \text{Gradient Descent} \\ \textbf{Input: } \text{initial parameters } p_0, \text{ learning rate } \alpha, \text{ number of iterations } T, \text{ convergence } \epsilon \\ \textbf{Output: } \text{final parameters } p\\ \phantom{0}1 : p \gets p_0 \\ \phantom{0}2 : \text{evaluate gradient } \nabla f(p) \\ \phantom{0}3 : \text{for i = 1 to T do} \\ \phantom{0}4 : \quad p_{new} \gets p - \alpha \nabla f(p) \\ \phantom{0}5 : \quad \text{evaluate gradient } \nabla f(p_{new}) \\ \phantom{0}6 : \quad \text{if } \|\nabla f(p_{new})\| < \epsilon \text{ then} \\ \phantom{0}7 : \quad \quad \text{return } p_{new} \\ \phantom{0}8 : \quad \text {end if} \\ \phantom{0}9 : \quad p \gets p_{new} \\ 10: \text{end for} \\ 11: \text{return } p \end{array}\)

🔑 Gradient descent is an iterative algorithm that uses the information contained in the gradient at a point to find a local minimum of a differentiable function

[27]:
def bivariate_gradient_descent(f, symbols, initial, steps, learning_rate):
    x, y = symbols
    nabla = sp.lambdify(
        (x, y), sp.Matrix([sp.diff(f, x), sp.diff(f, y)]), "numpy"
    )
    p = np.zeros((steps, 2))
    g = np.zeros((steps, 2))
    step_vector = np.zeros((steps, 2))
    p[0] = initial
    g[0] = nabla(*p[0]).squeeze()
    step_vector[0] = learning_rate * g[0]
    for i in range(1, steps):
        p[i] = p[i - 1] - step_vector[i - 1]
        g[i] = nabla(*p[i]).squeeze()
        step_vector[i] = learning_rate * g[i]
        if np.linalg.norm(g[i]) < 1e-4:
            break
    return p[:i], g[:i], step_vector[:i]


def fixup_animation_js(html_animation):
    html_animation = html_animation.replace(
        '<div class="anim-controls">',
        '<div class="anim-controls" style="display:none">',
    )
    animation_id = re.findall(r"onclick=\"(.*)\.", html_animation)[0]
    img_id = re.findall(r"<img id=\"(.*)\"", html_animation)[0]
    html_animation += f"""
<script language="javascript">
setupAnimationIntersectionObserver('{animation_id}', '{img_id}');
</script>
"""
    return html_animation


def gradient_descent_animation(f, symbols, initial, steps, learning_rate, lim, cmap):
    def _update(frame):
        global scat, quiv
        scat = ax.scatter(p[:frame, 0], p[:frame, 1], color="k")
        quiv = ax.quiver(
            p[:frame, 0],
            p[:frame, 1],
            -g[:frame, 0],
            -g[:frame, 1],
            angles="xy",
            scale_units="xy",
            scale=10,
            color="k",
        )

    x, y = symbols
    p, g, _ = bivariate_gradient_descent(f, symbols, initial, steps, learning_rate)
    fig, ax = plt.subplots()
    xx, yy = np.meshgrid(np.linspace(0, lim, 100), np.linspace(0, lim, 100))
    cs = ax.contourf(
        xx,
        yy,
        sp.lambdify((x, y), f, "numpy")(xx, yy),
        levels=20,
        cmap=cmap,
    )
    scat = ax.scatter([], [])
    quiv = ax.quiver([], [], [1e-6], [1e-6])
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_aspect("equal")
    plt.colorbar(cs, label="z")
    plt.title(
        rf"Gradient descent from ${sp.latex(p[0])}$ with $\alpha={learning_rate}$"
    )
    ani = FuncAnimation(fig=fig, func=_update, frames=steps)
    html_animation = ani.to_jshtml(default_mode="loop")
    if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
        html_animation = fixup_animation_js(html_animation)
    display(HTML(html_animation))
    plt.close()


gradient_descent_animation(
    temp,
    (x, y),
    initial=np.array([1, 3]),
    steps=10,
    learning_rate=0.1,
    lim=5,
    cmap="RdBu_r",
)

One of the major drawbacks of gradient descent is its sensitivity to the learning rate.

[28]:
gradient_descent_animation(
    temp,
    (x, y),
    initial=np.array([1, 3]),
    steps=20,
    learning_rate=0.3,
    lim=5,
    cmap="RdBu_r",
)

Convergence is not necessarily guaranteed and the learning rate must be chosen carefully.

Another drawback of gradient descent is that convergence doesn’t guarantee we’ve reached the global minimum.

Let’s consider a function with multiple minima.

[29]:
x, y = sp.symbols("x, y")
multi = (
    -(
        10 / (3 + 3 * (x - 0.5) ** 2 + 3 * (y - 0.5) ** 2)
        + 2 / (1 + 2 * ((x - 3) ** 2) + 2 * (y - 1.5) ** 2)
        + 3 / (1 + 0.5 * ((x - 3.5) ** 2) + 0.5 * (y - 4) ** 2)
    )
    + 10
)

xx, yy = np.meshgrid(np.linspace(0, 5, 100), np.linspace(0, 5, 100))

surface = go.Surface(
    z=sp.lambdify((x, y), multi, "numpy")(xx, yy),
    x=xx,
    y=yy,
    colorscale="BrBg_r",
    contours=dict(x=dict(show=True), y=dict(show=True), z=dict(show=True)),
    colorbar=dict(title="z"),
    name="multi function",
)

fig = go.Figure(surface)
fig.update_layout(
    title="Function with multiple minima",
    autosize=False,
    width=600,
    height=600,
    scene_aspectmode="cube",
    margin=dict(l=10, r=10, b=10, t=30),
    scene_camera=dict(
        eye=dict(x=-1.2, y=1.8, z=1.25),
    ),
)
fig.show()

In this case the choice of the initial values is very important.

[30]:
gradient_descent_animation(
    multi,
    (x, y),
    initial=np.array([1, 3]),
    steps=50,
    learning_rate=0.2,
    lim=5,
    cmap="BrBG_r",
)

Let’s start just 0.2 to the right along the x-axis: we get to a better minimum, and yet not the global one.

[31]:
gradient_descent_animation(
    multi,
    (x, y),
    initial=np.array([1.2, 3]),
    steps=50,
    learning_rate=0.2,
    lim=5,
    cmap="BrBG_r",
)

If we start a bit more to the left, we get to the global minimum.

[32]:
gradient_descent_animation(
    multi,
    (x, y),
    initial=np.array([0.6, 3]),
    steps=50,
    learning_rate=0.2,
    lim=5,
    cmap="BrBG_r",
)

Optimizing neural networks (week 3)#

Single-neuron network with linear activation and Mean Squared Error (MSE) loss function#

Let the linear model \(Z = wX + b\), where

\(X\) is a \((k, m)\) matrix of \(k\) features for \(m\) samples,

\(w\) is a \((1, k)\) matrix (row vector) containing \(k\) weights,

\(b\) is a \((1, 1)\) matrix (scalar) containing 1 bias, such that

\(Z = \begin{bmatrix}w_1&&w_2&&\dots w_k\end{bmatrix} \begin{bmatrix}x_{11}&&x_{12}&&\dots&&x_{1m}\\x_{21}&&x_{22}&&\dots&&x_{2m}\\\vdots&&\vdots&&\ddots&&\vdots\\x_{k1}&&x_{k2}&&\dots&&x_{km} \end{bmatrix} + \begin{bmatrix}b\end{bmatrix}\)

\(Z\) is a \((1, m)\) matrix.

Let \(Y\) be a \((1, m)\) matrix (row vector) containing the labels of \(m\) samples, such that

\(Y = \begin{bmatrix}y_1&&y_2&&\dots&&y_m\end{bmatrix}\)

[33]:
m = 40
k = 2
w = np.array([[0.37, 0.57]])
b = np.array([[0.1]])

rng = np.random.default_rng(4)
X = rng.standard_normal((k, m))
Y = w @ X + b + rng.normal(size=(1, m))

scatter = go.Scatter3d(
    z=Y.squeeze(),
    x=X[0],
    y=X[1],
    mode="markers",
    marker=dict(color="#1f77b4", size=5),
    name="data",
)

fig = go.Figure(scatter)
fig.update_layout(
    title="Regression data",
    autosize=False,
    width=600,
    height=600,
    scene_aspectmode="cube",
    margin=dict(l=10, r=10, b=10, t=30),
    scene=dict(
        xaxis=dict(title="x1", range=[-2.5, 2.5]),
        yaxis=dict(title="x2", range=[-2.5, 2.5]),
        zaxis_title="y",
        camera_eye=dict(x=1.2, y=-1.8, z=0.5),
    ),
)
fig.show()

The predictions \(\hat{Y}\) are the result of passing \(Z\) to a linear activation function, so that \(\hat{Y} = I(Z)\).

[34]:
def init_neuron_params(k):
    w = rng.uniform(size=(1, k)) * 0.5
    b = np.zeros((1, 1))
    return {"w": w, "b": b}


xx1, xx2 = np.meshgrid(np.linspace(-2.5, 2.5, 100), np.linspace(-2.5, 2.5, 100))
w, b = init_neuron_params(k).values()
random_model_plane = go.Surface(
    z=linear(w[0, 0] * xx1 + w[0, 1] * xx2 + b, m=1),
    x=xx1,
    y=xx2,
    colorscale=[[0, "#FF8920"], [1, "#FF8920"]],
    showscale=False,
    opacity=0.5,
    name="init params",
)
fig.add_trace(random_model_plane)
fig.update_layout(title="Random model")
fig.show()

For a single sample the squared loss is

\(\ell(w, b, y_i) = (y_i - \hat{y}_i)^2 = (y_i - wx_i - b)^2\)

For the whole sample the mean squared loss is

\(\mathcal{L}(w, b, Y) = \cfrac{1}{m}\sum \limits_{i=1}^{m} \ell(w, b, y_i) = \cfrac{1}{m}\sum \limits_{i=1}^{m} (y_i - wx_i - b)^2\)

So that we don’t have a lingering 2 in the partial derivatives, we can rescale it by 0.5

\(\mathcal{L}(w, b, Y) = \cfrac{1}{2m}\sum \limits_{i=1}^{m} (y_i - wx_i - b)^2\)

[35]:
def neuron_output(X, params, activation, *args):
    w = params["w"]
    b = params["b"]
    Z = w @ X + b
    Y_hat = activation(Z, *args)
    return Y_hat


def compute_mse_loss(Y, Y_hat):
    m = Y_hat.shape[1]
    return np.sum((Y - Y_hat) ** 2) / (2 * m)


params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss of random model: {compute_mse_loss(Y, Y_hat):.2f}")
MSE loss of random model: 0.49

The partial derivatives of the loss function are

\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial w}\)

\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial b}\)

Let’s calculate \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\), \(\cfrac{\partial \hat{Y}}{\partial w}\) and \(\cfrac{\partial \hat{Y}}{\partial b}\)

\(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}} = \cfrac{\partial}{\partial \hat{Y}} \cfrac{1}{2m}\sum \limits_{i=1}^{m} (Y - \hat{Y})^2 = \cfrac{1}{2m}\sum \limits_{i=1}^{m} 2(Y - \hat{Y})(- 1) = -\cfrac{1}{m}\sum \limits_{i=1}^{m}(Y - \hat{Y})\)

\(\cfrac{\partial \hat{Y}}{\partial w} = \cfrac{\partial}{\partial w} (wX + b) = X\)

\(\cfrac{\partial \hat{Y}}{\partial b} = \cfrac{\partial}{\partial b} (wX + b) = 1\)

Let’s put it all together

\(\cfrac{\partial \mathcal{L}}{\partial w} = -\cfrac{1}{m}\sum \limits_{i=1}^{m} (Y - \hat{Y}) \cdot X^T\)

\(\cfrac{\partial \mathcal{L}}{\partial b} = -\cfrac{1}{m}\sum \limits_{i=1}^{m} (Y - \hat{Y})\)

🔑 \(\cfrac{\partial \mathcal{L}}{\partial w}\) contains the partial derivatives wrt each of the \(k\) elements of \(w\); it’s a \((1, k)\) matrix, because the dot product is between a matrix of shape \((1, m)\) and \(X^T\), which is \((m, k)\).

[36]:
def compute_grads(X, Y, Y_hat):
    m = Y_hat.shape[1]
    dw = -1 / m * np.dot(Y - Y_hat, X.T)  # (1, k)
    db = -1 / m * np.sum(Y - Y_hat, axis=1, keepdims=True)  # (1, 1)
    return {"w": dw, "b": db}


def update_params(params, grads, learning_rate=0.1):
    params = params.copy()
    for k in grads.keys():
        params[k] = params[k] - learning_rate * grads[k]
    return params


params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss before update: {compute_mse_loss(Y, Y_hat):.2f}")
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, linear, 1)
print(f"MSE loss after update: {compute_mse_loss(Y, Y_hat):.2f}")
MSE loss before update: 0.50
MSE loss after update: 0.48
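
Since analytical gradients are easy to get wrong, here is a minimal finite-difference check of compute_grads (a sketch of ours reusing X, Y and the helpers above; it perturbs the first weight by a small epsilon):

[ ]:
eps = 1e-6
params_plus = {"w": params["w"].copy(), "b": params["b"].copy()}
params_plus["w"][0, 0] += eps
loss_base = compute_mse_loss(Y, neuron_output(X, params, linear, 1))
loss_plus = compute_mse_loss(Y, neuron_output(X, params_plus, linear, 1))
numeric_dw0 = (loss_plus - loss_base) / eps
analytic_dw0 = compute_grads(X, Y, neuron_output(X, params, linear, 1))["w"][0, 0]
print(np.isclose(numeric_dw0, analytic_dw0, atol=1e-4))  # True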

Let’s find the best parameters with gradient descent.

[37]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, linear, 1)
loss = compute_mse_loss(Y, Y_hat)
print(f"Iter 0  - MSE loss={loss:.6f}")
for i in range(1, 50 + 1):
    grads = compute_grads(X, Y, Y_hat)
    params = update_params(params, grads)
    Y_hat = neuron_output(X, params, linear, 1)
    loss_new = compute_mse_loss(Y, Y_hat)
    if loss - loss_new <= 1e-4:
        print(f"Iter {i} - MSE loss={loss:.6f}")
        print("The algorithm has converged")
        break
    loss = loss_new
    if i % 5 == 0:
        print(f"Iter {i} - MSE loss={loss:.6f}")
Iter 0  - MSE loss=0.439887
Iter 5 - MSE loss=0.420053
Iter 10 - MSE loss=0.412665
Iter 15 - MSE loss=0.409886
Iter 20 - MSE loss=0.408831
Iter 22 - MSE loss=0.408717
The algorithm has converged

Let’s visualize the final model.

[38]:
w, b = params.values()
final_model_plane = go.Surface(
    z=w[0, 0] * xx1 + w[0, 1] * xx2 + b,
    x=xx1,
    y=xx2,
    colorscale=[[0, "#2ca02c"], [1, "#2ca02c"]],
    showscale=False,
    opacity=0.5,
    name="final params",
)
fig.add_trace(final_model_plane)
fig.data[1].visible = False
fig.update_layout(title="Final model")
fig.show()

Single-neuron network with sigmoid activation and Log loss function#

As before, let the linear model \(Z = wX + b\), and let \(Y\) be a row vector containing the labels of \(m\) samples.

[39]:
m = 40
k = 2
neg_centroid = [-1, -1]
pos_centroid = [1, 1]

rng = np.random.default_rng(1)
X = np.r_[
    rng.standard_normal((m // 2, k)) + neg_centroid,
    rng.standard_normal((m // 2, k)) + pos_centroid,
].T
Y = np.array([[0] * (m // 2) + [1] * (m // 2)])

plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == 0, "tab:orange", "tab:blue"))
plt.gca().set_aspect("equal")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.title("Classification data")
plt.show()
../_images/notebooks_calculus_108_0.png

This time, though, the predictions \(\hat{Y}\) are the result of passing \(Z\) to a sigmoid function, so that \(\hat{Y} = \sigma(Z)\).

\(\sigma(Z) = \cfrac{1}{1+e^{-Z}}\)

To visualize the sigmoid function let’s add another axis.

[40]:
neg_scatter = go.Scatter3d(
    z=np.full(int(m / 2), 0),
    x=X[0, : int(m / 2)],
    y=X[1, : int(m / 2)],
    mode="markers",
    marker=dict(color="#ff7f0e", size=5),
    name="negative class",
)
pos_scatter = go.Scatter3d(
    z=np.full(int(m / 2), 1),
    x=X[0, int(m / 2) :],
    y=X[1, int(m / 2) :],
    mode="markers",
    marker=dict(color="#1f77b4", size=5),
    name="positive class",
)

fig = go.Figure([pos_scatter, neg_scatter])
fig.update_layout(
    title="Classification data",
    autosize=False,
    width=600,
    height=600,
    margin=dict(l=10, r=10, b=10, t=30),
    scene=dict(
        xaxis=dict(title="x1", range=[-5, 5]),
        yaxis=dict(title="x2", range=[-5, 5]),
        zaxis_title="y",
        camera_eye=dict(x=0, y=0.3, z=2.5),
        camera_up=dict(x=0, y=np.sin(np.pi), z=0),
    ),
)

Now, let’s plot the predictions of a randomly initialized model.

[41]:
xx1, xx2 = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
w, b = init_neuron_params(k).values()
random_model_plane = go.Surface(
    z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),
    x=xx1,
    y=xx2,
    colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
    showscale=False,
    opacity=0.5,
    name="init params",
)
fig.add_trace(random_model_plane)
fig.update_layout(
    title="Random model",
)
fig.show()

It turns out that the output of the sigmoid activation can be interpreted as the probability \(p\) of a sample belonging to the positive class, which implies that \((1-p)\) is the probability of the sample belonging to the negative class.

Intuitively, a loss function will be small when \(p_i\) is close to 1.0 and \(y_i = 1\) and when \(p_i\) is close to 0.0 and \(y_i = 0\).

For a single sample this loss function might look like this

\(\ell(w, b, y_i) = -p_i^{y_i}(1-p_i)^{1-y_i}\)

For the whole sample the loss would be

\(\mathcal{L}(w, b, Y) = -\prod \limits_{i=1}^{m} p_i^{y_i}(1-p_i)^{1-y_i}\)

In Univariate optimization (week 1) we’ve seen how it’s easier to calculate the derivative of the logarithm of the PMF of the Binomial Distribution.

We can do the same here to obtain a more manageable loss function.

\(\mathcal{L}(w, b, Y) = -\sum \limits_{i=1}^{m} y_i \ln p_i + (1-y_i) \ln (1-p_i)\)

It turns out it’s standard practice to minimize a function and to average the loss over the sample (to manage the scale of the loss for large datasets), so we’ll use this instead:

\(\mathcal{L}(w, b, Y) = -\cfrac{1}{m} \sum \limits_{i=1}^{m} y_i \ln p_i + (1-y_i) \ln (1-p_i)\)

Finally, let’s substitute back \(\hat{y}_i\) for \(p_i\).

\(\mathcal{L}(w, b, Y) = -\cfrac{1}{m} \sum \limits_{i=1}^{m} y_i \ln \hat{y}_i + (1-y_i) \ln (1-\hat{y}_i)\)

Recall that \(\hat{y}_i = \sigma(z_i)\).

[42]:
def compute_log_loss(Y, Y_hat):
    m = Y_hat.shape[1]
    loss = (-1 / m) * (
        np.dot(Y, np.log(Y_hat).T) + np.dot((1 - Y), np.log(1 - Y_hat).T)
    )
    return loss.squeeze()


params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss of random model: {compute_log_loss(Y, Y_hat):.2f}")
Log loss of random model: 0.50

The partial derivatives of the loss function are

\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial w}\)

\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial b}\)

Because we have an activation function around \(Z\), we have another chain rule to apply.

For the MSE loss, the activation was an identity function so it didn’t pose the same challenge.

\(\cfrac{\partial \mathcal{L}}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial w} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial Z}\cfrac{\partial Z}{\partial w}\)

\(\cfrac{\partial \mathcal{L}}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial b} = \cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\cfrac{\partial \sigma(Z)}{\partial Z}\cfrac{\partial Z}{\partial b}\)

Let’s calculate \(\cfrac{\partial \mathcal{L}}{\partial \sigma(Z)}\), \(\cfrac{\partial \sigma(Z)}{\partial Z}\), \(\cfrac{\partial Z}{\partial w}\) and \(\cfrac{\partial Z}{\partial b}\)

Calculation of partial derivative of Log loss wrt sigmoid activation#

\(\cfrac{\partial \mathcal{L}}{\partial \sigma(Z)} = \cfrac{\partial}{\partial \sigma(Z)} \left( -\cfrac{1}{m} \sum \limits_{i=1}^{m} Y\ln\sigma(Z)+(1-Y)\ln(1-\sigma(Z)) \right)\)

\(= -\cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{Y}{\sigma(Z)}+\cfrac{Y-1}{1-\sigma(Z)}\)

\(= -\cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{Y(1-\sigma(Z)) + (Y-1)\sigma(Z)}{\sigma(Z)(1-\sigma(Z))}\)

\(= -\cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))}\)

Calculation of the derivative of the sigmoid function#

\(\cfrac{\partial \sigma(Z)}{\partial Z} = \cfrac{\partial}{\partial Z} \cfrac{1}{1+e^{-Z}}\)

\(= \cfrac{\partial}{\partial Z}(1+e^{-Z})^{-1}\)

\(= -(1+e^{-Z})^{-2}(-e^{-Z})\)

\(= (1+e^{-Z})^{-2}e^{-Z}\)

\(= \cfrac{e^{-Z}}{(1+e^{-Z})^2}\)

\(= \cfrac{1 +e^{-Z} - 1}{(1+e^{-Z})^2}\)

\(= \cfrac{1 +e^{-Z}}{(1+e^{-Z})^2}-\cfrac{1}{(1+e^{-Z})^2}\)

\(= \cfrac{1}{1+e^{-Z}}-\cfrac{1}{(1+e^{-Z})^2}\)

Because \(x-x^2\) factors as \(x(1-x)\), we can rewrite it as

\(= \cfrac{1}{1+e^{-Z}}\left(1 - \cfrac{1}{1+e^{-Z}} \right)\)

\(= \sigma(Z)(1 - \sigma(Z))\)
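
A quick numerical sanity check of this result, using a central difference (a minimal sketch):

[ ]:
z = 0.3
h = 1e-6
numeric = (sigmoid(z + h) - sigmoid(z - h)) / (2 * h)
print(np.isclose(numeric, sigmoid(z) * (1 - sigmoid(z))))  # True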

The other two partial derivatives are the same as for the MSE loss.

\(\cfrac{\partial Z}{\partial w} = \cfrac{\partial}{\partial w} wX+b = X\)

\(\cfrac{\partial Z}{\partial b} = \cfrac{\partial}{\partial b} wX+b = 1\)

Let’s put it all together

\(\cfrac{\partial \mathcal{L}}{\partial w} = -\cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))} \sigma(Z)(1 - \sigma(Z)) X^T = -\cfrac{1}{m} \sum \limits_{i=1}^{m} (Y -\sigma(Z))X^T = -\cfrac{1}{m} \sum \limits_{i=1}^{m} (Y - \hat{Y}) X^T\)

\(\cfrac{\partial \mathcal{L}}{\partial b} = -\cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{Y -\sigma(Z)}{\sigma(Z)(1-\sigma(Z))} \sigma(Z)(1 - \sigma(Z)) = -\cfrac{1}{m} \sum \limits_{i=1}^{m} (Y - \hat{Y})\)

🔑 The partial derivatives of the Log loss, \(\cfrac{\partial \mathcal{L}}{\partial w}\) and \(\cfrac{\partial \mathcal{L}}{\partial b}\), are the same as those of the MSE loss.

[43]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss before update: {compute_log_loss(Y, Y_hat):.2f}")
grads = compute_grads(X, Y, Y_hat)
params = update_params(params, grads)
Y_hat = neuron_output(X, params, sigmoid)
print(f"Log loss after update: {compute_log_loss(Y, Y_hat):.2f}")
Log loss before update: 0.53
Log loss after update: 0.51

Let’s find the best parameters using gradient descent.

[44]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
loss = compute_log_loss(Y, Y_hat)
print(f"Iter 0  - Log loss={loss:.6f}")
for i in range(1, 500 + 1):
    grads = compute_grads(X, Y, Y_hat)
    params = update_params(params, grads)
    Y_hat = neuron_output(X, params, sigmoid)
    loss_new = compute_log_loss(Y, Y_hat)
    if loss - loss_new <= 1e-4:
        print(f"Iter {i} - Log loss={loss:.6f}")
        print("The algorithm has converged")
        break
    loss = loss_new
    if i % 100 == 0:
        print(f"Iter {i} - Log loss={loss:.6f}")
Iter 0  - Log loss=0.582076
Iter 100 - Log loss=0.182397
Iter 200 - Log loss=0.148754
Iter 300 - Log loss=0.134595
Iter 306 - Log loss=0.134088
The algorithm has converged

Let’s visualize the final model.

[45]:
w, b = params.values()
final_model_plane = go.Surface(
    z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),
    x=xx1,
    y=xx2,
    colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
    showscale=False,
    opacity=0.5,
    name="final params",
)
fig.add_trace(final_model_plane)
fig.data[2].visible = False
fig.update_layout(title="Final model")
fig.show()

Neural network with 1 hidden layer of 3 neurons and sigmoid activations#

Motivation#

Let’s create non-linear classification data to see the shortcomings of the previous model.

[46]:
m = 100
k = 2
linspace_out = np.linspace(0, 2 * np.pi, m // 2, endpoint=False)
linspace_in = np.linspace(0, 2 * np.pi, m // 2, endpoint=False)
outer_circ_x = np.cos(linspace_out)
outer_circ_y = np.sin(linspace_out)
inner_circ_x = np.cos(linspace_in) * 0.3
inner_circ_y = np.sin(linspace_in) * 0.3

rng = np.random.default_rng(1)
X = np.vstack(
    [np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]
)
X += rng.normal(scale=0.1, size=X.shape)
X = (X - np.mean(X, axis=1, keepdims=True)) / np.std(X, axis=1, keepdims=True)
Y = np.array([[0] * (m // 2) + [1] * (m // 2)])

plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == 0, "tab:orange", "tab:blue"))
plt.gca().set_aspect("equal")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.xlim(-3, 3)
plt.ylim(-3, 3)
plt.title("Non-linear classification data")
plt.show()
../_images/notebooks_calculus_129_0.png

As before let’s add another axis to visualize the model.

[47]:
neg_scatter = go.Scatter3d(
    z=np.full(int(m / 2), 0),
    x=X[0, : int(m / 2)],
    y=X[1, : int(m / 2)],
    mode="markers",
    marker=dict(color="#ff7f0e", size=5),
    name="negative class",
)
pos_scatter = go.Scatter3d(
    z=np.full(int(m / 2), 1),
    x=X[0, int(m / 2) :],
    y=X[1, int(m / 2) :],
    mode="markers",
    marker=dict(color="#1f77b4", size=5),
    name="positive class",
)

fig = go.Figure([pos_scatter, neg_scatter])
fig.update_layout(
    title="Non-linear classification data",
    autosize=False,
    width=600,
    height=600,
    margin=dict(l=10, r=10, b=10, t=30),
    scene=dict(
        xaxis=dict(title="x1", range=[-3, 3]),
        yaxis=dict(title="x2", range=[-3, 3]),
        zaxis_title="y",
        aspectmode="cube",
        camera_eye=dict(x=0, y=0, z=2.5),
        camera_up=dict(x=0, y=np.sin(np.pi), z=0),
    ),
)

Let’s train a single-neuron model with sigmoid activation using a Log loss function and plot the respective plane.

[48]:
params = init_neuron_params(k)
Y_hat = neuron_output(X, params, sigmoid)
loss = compute_log_loss(Y, Y_hat)
print(f"Iter 0  - Log loss={loss:.6f}")
for i in range(1, 500 + 1):
    grads = compute_grads(X, Y, Y_hat)
    params = update_params(params, grads)
    Y_hat = neuron_output(X, params, sigmoid)
    loss_new = compute_log_loss(Y, Y_hat)
    if loss - loss_new <= 1e-4:
        print(f"Iter {i} - Log loss={loss:.6f}")
        print("The algorithm has converged")
        break
    loss = loss_new
    if i % 100 == 0:
        print(f"Iter {i} - Log loss={loss:.6f}")

w, b = params.values()
xx1, xx2 = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))
final_model_plane = go.Surface(
    z=sigmoid(w[0, 0] * xx1 + w[0, 1] * xx2 + b),
    x=xx1,
    y=xx2,
    colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
    showscale=False,
    opacity=0.5,
    name="final params",
)
fig.add_trace(final_model_plane)
fig.update_layout(
    title="Linear model on non-linear data",
)
fig.show()
Iter 0  - Log loss=0.732818
Iter 62 - Log loss=0.695059
The algorithm has converged

Let’s try to improve it by adding more neurons.

Neural network notation#

The neural network we will be building is shown below.

8aa84e2c8eca4e64825a4de98cf2817d

A note on the indices of the weights:

For example, \(w_{121}\) (\(w_{\text{AT },\text{FROM },\text{TO}}\)) means “weight AT layer 1, FROM node 2 (previous layer) TO node 1 (current layer)”.

Let \(Z_1\) be the \((n, m)\) pre-activation matrix of the first and only hidden layer, where \(n\) is the number of neurons in the layer (3 here) and \(m\) the number of samples.

\(Z_1 = \begin{bmatrix} w_{111}x_{11}+w_{121}x_{21}+b_{1}&& w_{111}x_{12}+w_{121}x_{22}+b_{1}&& \cdots&& w_{111}x_{1m}+w_{121}x_{2m}+b_{1}\\ w_{112}x_{11}+w_{122}x_{21}+b_{1}&& w_{112}x_{12}+w_{122}x_{22}+b_{1}&& \cdots&& w_{112}x_{1m}+w_{122}x_{2m}+b_{1}\\ w_{113}x_{11}+w_{123}x_{21}+b_{1}&& w_{113}x_{12}+w_{123}x_{22}+b_{1}&& \cdots&& w_{113}x_{1m}+w_{123}x_{2m}+b_{1} \end{bmatrix}\)

A more compact matrix notation is shown below.

\(Z_1 = \begin{bmatrix} w_{111}&&w_{121}\\ w_{112}&&w_{122}\\ w_{113}&&w_{123}\\ \end{bmatrix} \begin{bmatrix} x_{11}&&x_{12}&&\cdots&&x_{1m}\\ x_{21}&&x_{22}&&\cdots&&x_{2m} \end{bmatrix} + \begin{bmatrix}b_{1}\end{bmatrix}\)

Let \(A_1 = h(Z_1)\) where \(h\) is an activation function (eg linear, sigmoid). For the hidden layers it’s common to use a ReLU activation function.

Let \(Z_2\) be the \((1, m)\) pre-activation matrix of the output layer.

\(Z_2 = \begin{bmatrix}w_{211}&&w_{221}&&w_{231}\end{bmatrix} \begin{bmatrix} a_{111}&&a_{112}&&\dots&&a_{11m}\\ a_{121}&&a_{122}&&\dots&&a_{12m}\\ a_{131}&&a_{132}&&\dots&&a_{13m} \end{bmatrix} + \begin{bmatrix}b_{2}\end{bmatrix}\)

\(\hat{Y} = \sigma(Z_2)\)

Note that we could use \(a_{0km}\) instead of \(x_{km}\) if we wanted to define a generic \(Z_{LAYER}\) matrix. We keep \(x\) and \(a\) distinct here, but it can help to think of \(x\) as basically just \(a_0\).
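
Before moving on, here is a minimal shape check of the notation above for the (2 inputs, 3 hidden neurons, 1 output) architecture, using placeholder random values (the _check-suffixed names are made up for this check only; the real parameters are initialized later).

rng_check = np.random.default_rng(0)
X_check = rng_check.normal(size=(2, 5))    # k=2 features, m=5 samples
W1_check = rng_check.normal(size=(3, 2))   # 3 hidden neurons, each with 2 incoming weights
b1_check = np.zeros((3, 1))
W2_check = rng_check.normal(size=(1, 3))   # 1 output neuron with 3 incoming weights
b2_check = np.zeros((1, 1))

Z1_check = W1_check @ X_check + b1_check   # (3, m)
A1_check = sigmoid(Z1_check)               # (3, m)
Z2_check = W2_check @ A1_check + b2_check  # (1, m)
Y_hat_check = sigmoid(Z2_check)            # (1, m)
print(Z1_check.shape, A1_check.shape, Z2_check.shape, Y_hat_check.shape)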

Partial derivatives of a neural network#

In our network we need 5 steps to go from the inputs through the hidden layer to the loss function.

For 1 sample (omitting the \(i\) index for conciseness):

  1. calculate the linear combinations \(z_{1n}\) for all \(n\) neurons in the first hidden layer.

\(z_{1n} = w_{11n}x_1 + w_{12n}x_2 + b_1\)

  2. pass them to the sigmoid activation function

\(a_{1n} = \sigma(z_{1n})\)

  3. calculate the linear combination \(z_{21}\) for the final output node

\(z_{21} = w_{211}a_{11} + w_{221}a_{12} + w_{231}a_{13} + b_2\)

  4. pass it to the sigmoid activation function

\(\hat{Y} = \sigma(z_{21})\)

  5. compute the Log loss

\(\mathcal{L}(w, b, Y) = -\left(y \ln \hat{y} + (1-y) \ln (1-\hat{y})\right)\)

It turns out that to work out the chain rule for the partial derivative of the loss wrt a parameter in the first layer, we just need to chain the partial derivatives of these 5 functions.

For example, let’s work out the chain rule for \(\cfrac{\partial \mathcal{L}}{\partial w_{111}}\)

Moving backwards, the first derivative we encounter is at 5. \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\), then at 4. \(\cfrac{\partial \hat{Y}}{\partial z_{21}}\), then at 3. \(\cfrac{\partial z_{21}}{\partial a_{11}}\), then at 2. \(\cfrac{\partial a_{11}}{\partial z_{11}}\) and finally at 1. \(\cfrac{\partial z_{11}}{\partial w_{111}}\)

\(\cfrac{\partial \mathcal{L}}{\partial w_{111}} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}} \cfrac{\partial \hat{Y}}{\partial z_{21}} \cfrac{\partial z_{21}}{\partial a_{11}} \cfrac{\partial a_{11}}{\partial z_{11}} \cfrac{\partial z_{11}}{\partial w_{111}}\)

From Calculation of partial derivative of Log loss wrt sigmoid activation we know

\(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}} = -\cfrac{Y - \hat{Y}}{\hat{Y}(1-\hat{Y})}\)

From Calculation of the derivative of the sigmoid function we know

\(\cfrac{\partial \hat{Y}}{\partial z_{21}} = \hat{Y}(1-\hat{Y})\)

And we know that these two simplify to

\(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial z_{21}} = - (Y - \hat{Y})\)

Similarly to \(\cfrac{\partial \hat{Y}}{\partial z_{21}}\) (partial derivative of the sigmoid function in the output node wrt its argument) we obtain \(\cfrac{\partial a_{11}}{\partial z_{11}}\) (partial derivative of the sigmoid function in the first node of the hidden layer wrt its argument)

\(\cfrac{\partial a_{11}}{\partial z_{11}} = a_{11}(1-a_{11})\)

Then the others are very easy to calculate.

\(\cfrac{\partial z_{21}}{\partial a_{11}} = \cfrac{\partial}{\partial a_{11}} w_{211}a_{11} + w_{221}a_{12} + w_{231}a_{13} + b_2 = w_{211}\)

\(\cfrac{\partial z_{11}}{\partial w_{111}} = \cfrac{\partial}{\partial w_{111}} w_{111}x_1 + w_{121}x_2 + b_1 = x_1\)

Let’s put it all together.

\(\cfrac{\partial \mathcal{L}}{\partial w_{111}} = - (Y - \hat{Y})w_{211}a_{11}(1-a_{11})x_1\)

The partial derivatives of the loss wrt the weights of the output layer have the same form as in the single-neuron neural network, with the hidden activations playing the role of the inputs.

As an example, let’s calculate \(\cfrac{\partial \mathcal{L}}{\partial w_{231}}\).

\(\cfrac{\partial \mathcal{L}}{\partial w_{231}} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}} \cfrac{\partial \hat{Y}}{\partial z_{21}} \cfrac{\partial z_{21}}{\partial w_{231}}\)

The only one we need to calculate is

\(\cfrac{\partial z_{21}}{\partial w_{231}} = \cfrac{\partial}{\partial w_{231}}w_{211}a_{11} + w_{221}a_{12} + w_{231}a_{13} + b_2 = a_{13}\)

Let’s put it all together.

\(\cfrac{\partial \mathcal{L}}{\partial w_{231}} = - (Y - \hat{Y})a_{13}\)
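
As a sanity check of these two results, we can build the single-sample network symbolically and compare sympy’s derivatives with the hand-derived expressions at one arbitrary numerical point (all the numbers below are placeholders chosen only for this check).

x1, x2, y = sp.symbols("x1 x2 y")
w111, w121, w112, w122, w113, w123, b1 = sp.symbols("w111 w121 w112 w122 w113 w123 b1")
w211, w221, w231, b2 = sp.symbols("w211 w221 w231 b2")


def sig(t):
    return 1 / (1 + sp.exp(-t))


a11 = sig(w111 * x1 + w121 * x2 + b1)
a12 = sig(w112 * x1 + w122 * x2 + b1)
a13 = sig(w113 * x1 + w123 * x2 + b1)
y_hat = sig(w211 * a11 + w221 * a12 + w231 * a13 + b2)
loss_expr = -(y * sp.log(y_hat) + (1 - y) * sp.log(1 - y_hat))

subs = dict(
    zip(
        [x1, x2, y, w111, w121, w112, w122, w113, w123, b1, w211, w221, w231, b2],
        [0.3, -1.2, 1, 0.5, -0.4, 0.1, 0.7, -0.9, 0.2, 0.05, 0.3, -0.6, 0.8, -0.1],
    )
)
# dL/dw111 should match -(y - y_hat) * w211 * a11 * (1 - a11) * x1
lhs_w111 = sp.diff(loss_expr, w111).evalf(subs=subs)
rhs_w111 = (-(y - y_hat) * w211 * a11 * (1 - a11) * x1).evalf(subs=subs)
assert float(sp.Abs(lhs_w111 - rhs_w111)) < 1e-9
# dL/dw231 should match -(y - y_hat) * a13
lhs_w231 = sp.diff(loss_expr, w231).evalf(subs=subs)
rhs_w231 = (-(y - y_hat) * a13).evalf(subs=subs)
assert float(sp.Abs(lhs_w231 - rhs_w231)) < 1e-9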

Forward and backward propagation#

It turns out there is an algorithm to efficiently train a neural network.

🔑 Forward propagation refers to the calculation and storage of each layer’s outputs, which are cached for use during backward propagation

🔑 Backward propagation (aka backprop) refers to the layer-by-layer calculation of partial derivatives: each layer outputs both the partial derivatives of the loss wrt the parameters used in that layer and the partial derivative of the loss wrt the output of the previous layer

Forward propagation doesn’t need a lot of explanation. A key aspect, though, is the storage of the outputs.

Backward propagation is also quite intuitive at the very high level as that’s sort of how, for example, we naturally approached the chain rule of \(\cfrac{\partial \mathcal{L}}{\partial w_{111}}\).

Recall we moved backwards starting from the partial derivative of the loss wrt to \(\hat{Y}\) and we ended up with

\(\cfrac{\partial \mathcal{L}}{\partial w_{111}} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}} \cfrac{\partial \hat{Y}}{\partial z_{21}} \cfrac{\partial z_{21}}{\partial a_{11}} \cfrac{\partial a_{11}}{\partial z_{11}} \cfrac{\partial z_{11}}{\partial w_{111}}\)

The forward and backward propagation algorithm is shown below.

4ad7ea14f66a4438aa62048970873f00

Note how the partial derivative of the loss wrt the parameters is built recursively and carried over from one layer to the next (or previous, depending on how you look at it).

The gray bit of the chain rule is what’s been carried over and the black bit is what’s been appended at each step.

Also note how the process “drops” the partial derivative of the loss wrt the parameters for the final layer as it goes, while it keeps building the partial derivative for the parameters used in the first layer.

That’s the main idea behind its efficiency.

The backward propagation for a network like the one we will be building will look like this.

Step 1

Inputs: \(Y\), \(\hat{Y}\)

compute \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}} = -\cfrac{Y - \hat{Y}}{\hat{Y}(1-\hat{Y})}\)

Outputs: \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\) (carry over to the next step)

Step 2

Inputs: \(\cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\) (from previous step), \(A_1\), \(W_2\), \(Z_2\) (from the layer’s cache)

compute \(\cfrac{\partial \hat{Y}}{\partial Z_{2}} = \cfrac{\partial}{\partial Z_{2}}\sigma(Z_2)\)

compute \(\cfrac{\partial \mathcal{L}}{\partial Z_{2}} = \cfrac{\partial \mathcal{L}}{\partial \hat{Y}}\cfrac{\partial \hat{Y}}{\partial Z_{2}}\)

compute \(\cfrac{\partial \mathcal{L}}{\partial A_{1}} = W_2^T \cdot \cfrac{\partial \mathcal{L}}{\partial Z_{2}}\)

compute \(\cfrac{\partial \mathcal{L}}{\partial W_{2}} = \cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{\partial \mathcal{L}}{\partial Z_{2}} \cdot A_1^T\)

compute \(\cfrac{\partial \mathcal{L}}{\partial b_{2}} = \cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{\partial \mathcal{L}}{\partial Z_{2}}\)

Outputs: \(\cfrac{\partial \mathcal{L}}{\partial A_{1}}\) (carry over to the next step), \(\cfrac{\partial \mathcal{L}}{\partial W_{2}}\), \(\cfrac{\partial \mathcal{L}}{\partial b_{2}}\) (send to update params process)

Step 3

Inputs: \(\cfrac{\partial \mathcal{L}}{\partial A_{1}}\) (from the previous step), \(X\), \(W_1\), \(Z_1\) (from the layer’s cache)

compute \(\cfrac{\partial A_{1}}{\partial Z_{1}} = \cfrac{\partial}{\partial Z_{1}}\text{ReLU}(Z_1)\)

compute \(\cfrac{\partial \mathcal{L}}{\partial Z_{1}} = \cfrac{\partial \mathcal{L}}{\partial A_{1}}\cfrac{\partial A_{1}}{\partial Z_{1}}\)

compute \(\cfrac{\partial \mathcal{L}}{\partial X} = W_1^T \cdot \cfrac{\partial \mathcal{L}}{\partial Z_{1}}\)

compute \(\cfrac{\partial \mathcal{L}}{\partial W_{1}} = \cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{\partial \mathcal{L}}{\partial Z_{1}} \cdot X^T\)

compute \(\cfrac{\partial \mathcal{L}}{\partial b_{1}} = \cfrac{1}{m} \sum \limits_{i=1}^{m} \cfrac{\partial \mathcal{L}}{\partial Z_{1}}\)

Outputs: \(\cfrac{\partial \mathcal{L}}{\partial A_{0}}\) (do not use further), \(\cfrac{\partial \mathcal{L}}{\partial W_{1}}\), \(\cfrac{\partial \mathcal{L}}{\partial b_{1}}\) (send to update params process)

More generally, the pseudo-code might be something along these lines.

\(\begin{array}{l} \textbf{Algorithm: } \text{Forward and Backward Propagation} \\ \textbf{Input: } X, Y, \text{initial parameter} [W_1, b_1, \dots, W_L, b_L] \\ \textbf{Output: } \text{gradients} [\nabla W_1, \nabla b_1, \dots, \nabla W_L, \nabla b_L] \\ \phantom{0}1 : L \gets \text{len}([W_1, b_1, \dots, W_L, b_L]) / 2 \\ \phantom{0}2 : A_{i-1} \gets X \\ \phantom{0}3 : \text{Forward propagation:} \\ \phantom{0}4 : \text{for i = 1 to L do} \\ \phantom{0}5 : \quad \text{if } i < L \text{ then} \\ \phantom{0}6 : \quad \quad \mathcal{f}_i \gets ReLU \\ \phantom{0}7 : \quad \text{else } \\ \phantom{0}8 : \quad \quad \mathcal{f}_i \gets \sigma \\ \phantom{0}9 : \quad \text {end if} \\ 10 : \quad Z_i \gets W_iA_{i-1} + b_i \\ 11 : \quad A_i \gets \mathcal{f}_i(Z_i) \\ 12 : \quad \text{cache}_i \gets [A_{i-1}, \mathcal{f}_i, W_i, Z_i] \\ 13 : \quad \text{if } i = L \text{ then} \\ 14 : \quad \quad \hat{Y} \gets A_L \\ 15 : \quad \text {end if} \\ 16 : \text{end for} \\ 17 : \text{Backward propagation:} \\ 18 : dA_L \gets -(Y - \hat{Y}) / \hat{Y}(1-\hat{Y}) \\ 19 : \text{for i = L down to 1 do} \\ 20 : \quad [A_{i-1}, \mathcal{f}_i, W_i, Z_i] \gets \text{unpack } \text{cache}_i \\ 21 : \quad dZ_i = dA_i \mathcal{f'}_i(Z_i) \\ 22 : \quad \nabla W_i = (1/m) \sum_{i=1}^{m} dZ_i A_{i-1}^T \\ 23 : \quad \nabla b_i = (1/m) \sum_{i=1}^{m} dZ_i \\ 24 : \quad dA_{i-1} = W_i^T dZ_i \\ 25 : \text{end for} \\ 26 : \text{return } [\nabla W_1, \nabla b_1, \dots, \nabla W_L, \nabla b_L] \\ \end{array}\)

Building a neural network#

The first step is to randomly initialize all the parameters.

A common initialization of weights in neural networks is the Glorot Uniform, which takes into account the size of the previous layer and the size of the current layer.

\(W_{i} \sim U\left(-\cfrac{\sqrt{6}}{\sqrt{fan_{i-1} + fan_{i}}}, \cfrac{\sqrt{6}}{\sqrt{fan_{i-1} + fan_{i}}}\right)\), where \(U(a, b)\) is the Uniform distribution and \(fan_{i-1}\) is the size of the previous layer and \(fan_i\) is the size of the current layer.

[49]:
def init_nn_params(n):
    params = {}
    for i in range(len(n) - 1):
        fan_in, fan_out = n[i : i + 2]
        limit = np.sqrt(6 / (fan_in + fan_out))
        params["W" + str(i + 1)] = rng.uniform(-limit, limit, size=(fan_out, fan_in))
        params["b" + str(i + 1)] = np.zeros((n[i + 1], 1))
    return params


params = init_nn_params([X.shape[0], 3, Y.shape[0]])
params
[49]:
{'W1': array([[ 0.223084  , -0.4653109 ],
        [ 0.61949715, -0.54494543],
        [-0.93066582,  1.01408548]]),
 'b1': array([[0.],
        [0.],
        [0.]]),
 'W2': array([[0.09800704, 0.67090127, 0.07158097]]),
 'b2': array([[0.]])}

Then, we do forward propagation and compute the loss.

[50]:
def step_forward_prop(A_prev, layer, params, activation):
    W = params["W" + str(layer)]
    b = params["b" + str(layer)]
    Z = W @ A_prev + b
    A = activation(Z)
    cache = A_prev, W, Z, activation.__name__
    return A, cache


def forward_prop(X, params, hidden_activation=relu, output_activation=sigmoid):
    A_prev = X
    caches = []
    n_layers = len(params) // 2
    for layer in range(1, n_layers + 1):
        if layer <= n_layers - 1:
            activation = hidden_activation
        else:
            activation = output_activation
        A, cache = step_forward_prop(A_prev, layer, params, activation)
        A_prev = A
        caches.append(cache)
    return A, caches


Y_hat, caches = forward_prop(X, params)
print(f"Output layer is {caches[-1][2].shape} with {caches[-1][3]} activation")
for layer in range(len(caches) - 2, -1, -1):
    print(
        f"Hidden layer {layer + 1} is {caches[layer][2].shape} with {caches[layer][3]} activation"
    )
print(f"\nLog loss of random model: {compute_log_loss(Y, Y_hat):.2f}")
Output layer is (1, 100) with sigmoid activation
Hidden layer 1 is (3, 100) with relu activation

Log loss of random model: 0.79

Then, we do backward propagation.

In Proof that ReLU is not differentiable at 0 we said that the convention in machine learning is to consider its derivative at 0, well, 0.

Let’s see how just one undetermined derivative can wreak havoc on the parameter gradients during backward propagation.

[51]:
dA2 = -(Y - Y_hat) / np.multiply(Y_hat, (1 - Y_hat))
A1, W2, Z2, _ = caches[1]
dZ2 = dA2 * d_sigmoid(Z2)
dA1 = np.dot(W2.T, dZ2)
X, W1, Z1, _ = caches[0]
# let's introduce one NaN in Z1
Z1[0, 0] = np.nan
dZ1 = dA1 * d_relu(Z1)
dW1 = (1 / dZ1.shape[1]) * np.dot(dZ1, X.T)
dW1 = dW1.round(2)

display(
    Math(
        "\\langle \\nabla W_{111}, \\nabla W_{121} \\rangle="
        + sp.latex(sp.Matrix(dW1[0]))
    ),
    Math(
        "\\langle \\nabla W_{112}, \\nabla W_{122} \\rangle="
        + sp.latex(sp.Matrix(dW1[1]))
    ),
    Math(
        "\\langle \\nabla W_{113}, \\nabla W_{123} \\rangle="
        + sp.latex(sp.Matrix(dW1[2]))
    ),
)
$\displaystyle \langle \nabla W_{111}, \nabla W_{121} \rangle=\left[\begin{matrix}\text{NaN}\\\text{NaN}\end{matrix}\right]$
$\displaystyle \langle \nabla W_{112}, \nabla W_{122} \rangle=\left[\begin{matrix}0.09\\-0.07\end{matrix}\right]$
$\displaystyle \langle \nabla W_{113}, \nabla W_{123} \rangle=\left[\begin{matrix}-0.01\\0.01\end{matrix}\right]$

So we can’t simply do this.

\(\cfrac{\partial A_{1}}{\partial Z_{1}} = \cfrac{\partial}{\partial Z_{1}}\text{ReLU}(Z_1) = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x < 0\\\text{undefined if } x = 0\end{cases}\)

As per the convention, we’ll do this instead.

\(\cfrac{\partial A_{1}}{\partial Z_{1}} = \begin{cases}1 \text{ if } x > 0\\0 \text{ if } x \le 0\end{cases}\)

[52]:
def init_backward_prop(Y_hat):
    dAL = -(Y - Y_hat) / np.multiply(Y_hat, (1 - Y_hat))
    return dAL


def step_backward_prop(dA_prev, cache):
    # extract elements from cache
    A_prev, W, Z, activation = cache
    # propagate dA gradient through activation function
    # d_relu is nan when x is 0
    # instead of propagating nan's we make them 0
    func = {"sigmoid": d_sigmoid, "relu": lambda x: np.nan_to_num(d_relu(x))}
    dZ = dA_prev * func[activation](Z)
    # compute dW, db, dA_prev
    # dZ2 is (1, m) and A1 is (3, m), so dW2 is (1, 3)
    # dZ1 is (3, m) and X is (2, m), so dW1 is (3, 2)
    dW = (1 / dZ.shape[1]) * np.dot(dZ, A_prev.T)
    # dZ2 is (1, m) so db2 is (1, 1)
    # dZ1 is (3, m) so db1 is (3, 1)
    db = (1 / dZ.shape[1]) * np.sum(dZ, axis=1, keepdims=True)
    # W2 is (1, 3) and dZ2 is (1, m), so dA1 is (3, m)
    # W1 is (3, 2) and dZ1 is (3, m), so dX is (2, m) (not used though)
    dA_next = np.dot(W.T, dZ)
    return dA_next, dW, db


def backward_prop(Y_hat, caches):
    dA_prev = init_backward_prop(Y_hat)
    grads = params.copy()
    for layer in range(len(caches), 0, -1):
        dA, dW, db = step_backward_prop(dA_prev, caches[layer - 1])
        grads["W" + str(layer)] = dW
        grads["b" + str(layer)] = db
        dA_prev = dA
    return grads


grads = backward_prop(Y_hat, caches)
grads
[52]:
{'W1': array([[ 0.00495321, -0.01516951],
        [ 0.08724881, -0.07424792],
        [-0.00585036,  0.00528136]]),
 'b1': array([[ 0.00469289],
        [ 0.04855939],
        [-0.00072097]]),
 'W2': array([[0.0867413 , 0.14087237, 0.15088486]]),
 'b2': array([[0.06738911]])}

At this point it’s the usual gradient descent algorithm.

Let’s run it 5 times (each with up to 500 iterations of gradient descent) and pick the run with the lowest training loss.

[53]:
runs = []
for _ in range(5):
    history = ""
    params = init_nn_params([X.shape[0], 3, Y.shape[0]])
    Y_hat, caches = forward_prop(X, params)
    loss = compute_log_loss(Y, Y_hat)
    history += f"Iter 0   - Log loss={loss:.6f}\n"
    for i in range(1, 500 + 1):
        grads = backward_prop(Y_hat, caches)
        params = update_params(params, grads, learning_rate=0.5)
        Y_hat, caches = forward_prop(X, params)
        loss_new = compute_log_loss(Y, Y_hat)
        if loss - loss_new <= 1e-4:
            history += f"Iter {i} - Log loss={loss:.6f}\n"
            history += f"The algorithm has converged\n"
            break
        loss = loss_new
        if i % 100 == 0:
            history += f"Iter {i} - Log loss={loss:.6f}\n"
    runs.append((loss, params, history))

runs.sort(key=lambda x: x[0])
params = runs[0][1]
print(runs[0][2])
Iter 0   - Log loss=0.618392
Iter 100 - Log loss=0.137058
Iter 200 - Log loss=0.054018
Iter 300 - Log loss=0.033291
Iter 334 - Log loss=0.029567
The algorithm has converged

Let’s visualize the model.

[54]:
XX = np.c_[xx1.flatten(), xx2.flatten()].T
Y_hat, _ = forward_prop(XX, params)

final_model_plane = go.Surface(
    z=Y_hat.reshape(xx1.shape),
    x=xx1,
    y=xx2,
    colorscale=[[0, "#ff7f0e"], [1, "#1f77b4"]],
    showscale=False,
    opacity=0.5,
    name="final params",
)
fig.data[2].visible = False
fig.add_trace(final_model_plane)
fig.update_layout(
    title="Neural Network model",
)
fig.show()

Newton-Raphson method (week 3)#

Newton-Raphson method with one variable#

🔑 Newton-Raphson method is an algorithm to find the zero of a function

It can be used to optimize a function \(f(x)\) by finding the zero of \(f'(x)\).

The method works by starting from a random point \(x_0\), finding the tangent to \(f'(x)\) at \(x_0\) and finding where that tangent crosses the x-axis.

The idea is that this new point is closer than \(x_0\) to a minimum or maximum.

[55]:
def _update(frame):
    global fscat, dscat, dtang
    fscat.remove()
    dscat.remove()
    dtang.remove()
    fscat = ax.scatter(xs[frame], f(xs[frame]), color="tab:blue")
    dscat = ax.scatter(xs[frame], df(xs[frame]), color="tab:orange")
    (dtang,) = ax.plot(
        xx,
        ddf(xs[frame]) * (xx - xs[frame]) + df(xs[frame]),
        "--",
        color="tab:orange",
        alpha=0.5,
    )


x = sp.symbols("x")
f = sp.exp(x) - sp.log(x)
df = sp.lambdify(x, sp.diff(f, x, 1))
ddf = sp.lambdify(x, sp.diff(f, x, 2))
f = sp.lambdify(x, f)

xs = [1.75]
for _ in range(4):
    xs.append(xs[-1] - df(xs[-1]) / ddf(xs[-1]))

fig, ax = plt.subplots()
xx = np.linspace(1e-3, 2)
ax.plot(xx, f(xx), color="tab:blue", label="$f(x)$")
ax.plot(xx, df(xx), color="tab:orange", label="$f'(x)$")
fscat = ax.scatter([], [])
dscat = ax.scatter([], [])
(dtang,) = ax.plot([], [])
ax.set_ylim(-2, 11)
ax.set_xlim(0, 2)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.legend(loc="upper right")
plt.title("Minimazation of $f(x)$ using Newton's method")

ani = FuncAnimation(fig=fig, func=_update, frames=4, interval=1000)
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
    html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()

The equation of a line that passes through \((x_0, f(x_0))\) is the point-slope form

\(f(x) - f(x_0) = m(x-x_0)\)

\(f(x) = m(x - x_0) + f(x_0)\)

We substitute \(f'(x)\) for \(f(x)\), \(f'(x_0)\) for \(f(x_0)\), and the derivative of \(f'(x)\) at \(x_0\) (ie the second derivative \(f''(x_0)\)) for \(m\) and we obtain

\(f'(x) = f''(x_0)(x - x_0) + f'(x_0)\)

Now we want to find the \(x\) such that \(f'(x) = 0\), that is the point where the tangent crosses the x-axis.

\(0 = f''(x_0)(x - x_0) + f'(x_0)\)

\(-f''(x_0)(x - x_0) = f'(x_0)\)

\(-(x - x_0) = \cfrac{f'(x_0)}{f''(x_0)}\)

\(-x + x_0 = \cfrac{f'(x_0)}{f''(x_0)}\)

\(-x = - x_0 + \cfrac{f'(x_0)}{f''(x_0)}\)

\(x = x_0 - \cfrac{f'(x_0)}{f''(x_0)}\)

The pseudo-code might look something along these lines

\(\begin{array}{l} \textbf{Algorithm: } \text{Newton's Method for Univariate Optimization} \\ \textbf{Input: } \text{function to minimize } \mathcal{f}, \text{initial guess } x_0, \text{number of iterations } K, \text{convergence } \epsilon \\ \textbf{Output: } x \\ 1 : x \gets x_0 \\ 2 : \text{for k = 1 to K do} \\ 3 : \quad x_{new} = x - f'(x) / f''(x) \\ 4 : \quad \text{if } \|f'(x_{new}) / f''(x_{new})\| < \epsilon \text{ then} \\ 5 : \quad \quad \text{return } x \\ 6 : \quad \text {end if} \\ 7 : \quad x \gets x_{new} \\ 8 : \text{return } x \\ \end{array}\)

Another way to find \(x\) such that \(f'(x) = 0\) is through the rise over run equation.

\(f''(x_0) = \cfrac{f'(x_0)}{x_0 - x}\)

\(\cfrac{1}{f''(x_0)} = \cfrac{x_0 - x}{f'(x_0)}\)

\(\cfrac{f'(x_0)}{f''(x_0)} = x_0 - x\)

\(\cfrac{f'(x_0)}{f''(x_0)} - x_0 = - x\)

\(x_0 - \cfrac{f'(x_0)}{f''(x_0)} = x\)

A third way to find \(x\) such that \(f'(x) = 0\) is through expressing \(f(x)\) as a quadratic approximation (Taylor series truncated at the second order).

The Taylor series of a function \(f(x)\) that is infinitely differentiable at \(x_0\) is

\(f(x)=\sum_{n=0}^\infty\cfrac{f^{(n)}(x_0)(x-x_0)^n}{n!}\)

Before proceeding let’s take a look at what a Taylor series is.

[56]:
def _update(order):
    global taylor
    taylor.remove()
    series = np.zeros((order, 50))
    for n in range(1, order + 1):
        series[n - 1] = (
            (1 / math.factorial(n))
            * sp.diff(expr, x, n).evalf(subs={x: 0})
            * (xx**n)
        )
    (taylor,) = ax.plot(xx, np.sum(series, axis=0), color="tab:orange")


x = sp.symbols("x")
expr = sp.sin(x)
sin = sp.lambdify(x, expr, "numpy")
xx = np.linspace(-10, 10)
fig, ax = plt.subplots()
ax.plot(xx, sin(xx), color="tab:blue", label="$\sin(x)$")
(taylor,) = ax.plot([], [], color="tab:orange", label="Taylor series")
ax.set_aspect("equal")
ax.set_ylim(-10, 10)
ax.set_xlim(-10, 10)
plt.legend(loc="upper right")
plt.title("Taylor series of order 1, 3, ..., 25 of sine function")


ani = FuncAnimation(fig=fig, func=_update, frames=range(1, 27, 2))
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
    html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()

🔑 The Taylor series of a function is an infinite sum of terms (polynomials) that are expressed in terms of the function derivatives at a point.

The quadratic approximation of \(f(x)\) (Taylor series truncated at the second degree polynomial) is

\(f(x) \approx f(x_0) + f'(x_0)(x-x_0) + \cfrac{1}{2}f''(x_0)(x-x_0)^2\)

To find \(x\) such that \(f'(x) = 0\), we set the derivative of the quadratic approximation to zero.

\(\cfrac{d}{dx}\left[f(x_0) + f'(x_0)(x-x_0) + \cfrac{1}{2}f''(x_0)(x-x_0)^2\right] = 0\)

\(f'(x_0) + f''(x_0)(x-x_0) = 0\)

\(f''(x_0)(x-x_0) = -f'(x_0)\)

\(x-x_0 = -\cfrac{f'(x_0)}{f''(x_0)}\)

\(x = x_0 -\cfrac{f'(x_0)}{f''(x_0)}\)

This derivation of the Newton’s method update formula from the quadratic approximation of a function has a few important implications:

  • At each step, Newton’s method finds the minimum of the quadratic approximation of \(f(x)\)

  • When \(f(x)\) is indeed quadratic, Newton’s method will find the minimum or maximum in one single step

  • When \(f(x)\) is not quadratic, Newton’s method will exhibit quadratic convergence near the minimum or maximum. Quadratic convergence refers to the rate at which the sequence \(\{x_k\}\) approaches the minimum or maximum: the "error" at each step is roughly proportional to the square of the error at the previous step.

[57]:
def _update(frame):
    global fscat, taylor, taylor_min
    fscat.remove()
    taylor.remove()
    taylor_min.remove()
    fscat = ax.scatter(xs[frame], f(xs[frame]), color="tab:blue")
    (taylor,) = ax.plot(
        xx,
        f(xs[frame])
        + df(xs[frame]) * (xx - xs[frame])
        + (1 / 2) * ddf(xs[frame]) * (xx - xs[frame]) ** 2,
        "--",
        color="tab:orange",
        alpha=0.5,
        label=r"$f(x_0) + f'(x_0)(x-x_0) + \dfrac{1}{2}f''(x_0)(x-x_0)^2$",
    )
    taylor_min = ax.scatter(
        xs[frame + 1],
        f(xs[frame])
        + df(xs[frame]) * (xs[frame + 1] - xs[frame])
        + (1 / 2) * ddf(xs[frame]) * (xs[frame + 1] - xs[frame]) ** 2,
        color="tab:orange",
    )


x = sp.symbols("x")
f = sp.exp(x) - sp.log(x)
df = sp.lambdify(x, sp.diff(f, x, 1))
ddf = sp.lambdify(x, sp.diff(f, x, 2))
f = sp.lambdify(x, f)

xs = [1.75]
for _ in range(4):
    xs.append(xs[-1] - df(xs[-1]) / ddf(xs[-1]))

fig, ax = plt.subplots()
xx = np.linspace(1e-3, 2)
ax.plot(xx, f(xx), color="tab:blue", label="$f(x)$")
fscat = ax.scatter([], [])
(taylor,) = ax.plot(
    [],
    [],
    color="tab:orange",
    alpha=0.5,
    label=r"$f(x_0) + f'(x_0)(x-x_0) + \dfrac{1}{2}f''(x_0)(x-x_0)^2$",
)
taylor_min = ax.scatter([], [])
ax.set_ylim(-2, 11)
ax.set_xlim(0, 2)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$f(x)$", transform=ax.get_xaxis_transform())
plt.legend(loc="upper right")
plt.title(
    "At each step, Newton's method finds\nthe minimum of the quadratic approximation of $f(x)$"
)

ani = FuncAnimation(fig=fig, func=_update, frames=4, interval=1000)
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
    html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()

Finally, let’s inspect the algorithm.

[58]:
def univariate_newtons_method(f, symbols, initial, steps):
    x = symbols
    df = sp.lambdify(x, sp.diff(f, x, 1))
    ddf = sp.lambdify(x, sp.diff(f, x, 2))
    p = np.zeros((steps, 1))
    g = np.zeros((steps, 1))
    h = np.zeros((steps, 1))
    step_vector = np.zeros((steps, 1))
    p[0] = initial
    g[0] = df(p[0])
    h[0] = ddf(p[0])
    step_vector[0] = df(p[0]) / ddf(p[0])
    for i in range(1, steps):
        p[i] = p[i - 1] - step_vector[i - 1]
        g[i] = df(p[i])
        h[i] = ddf(p[i])
        step_vector[i] = df(p[i]) / ddf(p[i])
        if np.linalg.norm(step_vector[i]) < 1e-4:
            break
    return p[: i + 1], g[: i + 1], h[: i + 1], step_vector[: i + 1]


x = sp.symbols("x")
f = sp.exp(x) - sp.log(x)
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(
    Math(
        "\\arg \\min \\limits_{x}" + sp.latex(f) + f"={p[-1].squeeze():.4f}"
    )
)
print(f"Newton's method converged after {len(p)-1} steps")
$\displaystyle \arg \min \limits_{x}e^{x} - \log{\left(x \right)}=0.5671$
Newton's method converged after 4 steps

Let’s consider the quadratic function \(f(x) = x^2\) which has a minimum at 0.

Based on what we said above we expect Newton’s method to find this minimum in one step.

[59]:
x = sp.symbols("x")
f = x**2
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(
    Math(
        "\\arg \\min \\limits_{x}" + sp.latex(f) + f"={p[-1].squeeze():.4f}"
    )
)
print(f"Newton's method converged after {len(p)-1} steps")
$\displaystyle \arg \min \limits_{x}x^{2}=0.0000$
Newton's method converged after 1 steps

Let’s consider the quadratic function \(f(x) = -x^2\) which has a maximum at 0.

Based on what we said above we expect Newton’s method to find this maximum in one step.

[60]:
x = sp.symbols("x")
f = -(x**2)
p, _, _, _ = univariate_newtons_method(f, symbols=x, initial=1.75, steps=10)
display(
    Math(
        "\\arg \\min \\limits_{x}"
        + sp.latex(f)
        + f"={p[-1].squeeze():.4f}"
    )
)
print(f"Newton's method converged after {len(p)-1} steps")
$\displaystyle \arg \min \limits_{x}- x^{2}=0.0000$
Newton's method converged after 1 steps

Second derivatives#

In the previous section we introduced second derivatives without too much explanation of what they are.

After all, analytically, it’s just the derivative of a derivative.

But what do they represent, and what are they used for?

Consider this analogy.

🔑 Let \(f(t)\) be the distance, then the first derivative is velocity (instantaneous rate of change of distance wrt time), then the second derivative is acceleration (instantaneous rate of change of velocity wrt time).

[61]:
def _update(raw_frame):
    global distance_point, velocity_point, acceleration_point
    distance_point.remove()
    velocity_point.remove()
    acceleration_point.remove()
    frame = int(frame_func(raw_frame))
    xxf = xx[: frame + 1]
    distance_line = ax.plot(
        xxf, sp.lambdify(x, norm_pdf, "numpy")(xxf), c="tab:blue"
    )
    velocity_line = ax.plot(
        xxf,
        sp.lambdify(x, sp.diff(norm_pdf, x, 1), "numpy")(xxf),
        c="tab:orange",
    )
    acceleration_line = ax.plot(
        xxf,
        sp.lambdify(x, sp.diff(norm_pdf, x, 2), "numpy")(xxf),
        c="tab:green",
    )
    distance_point = ax.scatter(
        xx[frame], sp.lambdify(x, norm_pdf, "numpy")(xx[frame]), color="tab:blue"
    )
    velocity_point = ax.scatter(
        xx[frame],
        sp.lambdify(x, sp.diff(norm_pdf, x, 1), "numpy")(xx[frame]),
        color="tab:orange",
    )
    acceleration_point = ax.scatter(
        xx[frame],
        sp.lambdify(x, sp.diff(norm_pdf, x, 2), "numpy")(xx[frame]),
        color="tab:green",
    )


x = sp.symbols("x")
norm_pdf = (1 / sp.sqrt(2 * np.pi)) * sp.exp(-(x**2) / 2)
xx = np.linspace(-3, 3, 100)
fig, ax = plt.subplots()
distance_line = ax.plot([], [], c="tab:blue", label="distance")
velocity_line = ax.plot([], [], c="tab:orange", label="velocity")
acceleration_line = ax.plot([], [], c="tab:green", label="acceleration")
distance_point = ax.scatter([], [])
velocity_point = ax.scatter([], [])
acceleration_point = ax.scatter([], [])
ax.spines["bottom"].set_position(("data", 0))
plt.xlim(-3, 3)
plt.ylim(-0.5, 0.5)
plt.legend()
plt.title("Distance, velocity and acceleration analogy")

# Create velocity as a function of frame
# Takes values between 0 and 100 as arguments and returns the velocity-adjusted frame value.
# For example between 0 and 20 velocity is slow
# Frames instead of going 0, 1, 2 will go 0, 0, 1 etc.
velocity_func = sp.lambdify(x, sp.diff(norm_pdf, x, 1), "numpy")
velocity_magnitudes = np.abs(velocity_func(xx))
normalized_magnitudes = (
    velocity_magnitudes / np.sum(velocity_magnitudes) * (len(xx) - 1)
)
adjusted_frame_positions = np.cumsum(normalized_magnitudes)
frame_func = interp1d(
    np.arange(len(xx)),
    adjusted_frame_positions,
    kind="linear",
    fill_value="extrapolate",
)

ani = FuncAnimation(fig=fig, func=_update, frames=len(xx))
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
    html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()

🔑 The second derivative tells us about the curvature of a function, or in other words if a function is concave up or concave down.

When the second derivative is positive the function is concave up.

When the second derivative is negative the function is concave down.

When the second derivative is 0 the curvature of the function is undetermined.

This information in combination with the first derivative tells us about our position relative to a minimum or maximum.

\(f'(x) > 0\) and \(f''(x) > 0\): we’re on the right hand side of a concave up function (minimum to our left) (like between -3 and -1 on the graph above).

\(f'(x) > 0\) and \(f''(x) < 0\): we’re on the left hand side of a concave down function (maximum to our right) (like between -1 and 0 on the graph above).

\(f'(x) < 0\) and \(f''(x) < 0\): we’re on the right hand side of a concave down function (maximum to our left) (like between 0 and 1 on the graph above).

\(f'(x) < 0\) and \(f''(x) > 0\): we’re on the left hand side of a concave up function (minimum to our right) (like between 1 and 3 on the graph above).
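
As a quick numeric check of these four cases on the graph above (the standard normal pdf used in the animation), we can evaluate the first and second derivative at one point in each region (the evaluation points are chosen arbitrarily).

x = sp.symbols("x")
norm_pdf = (1 / sp.sqrt(2 * np.pi)) * sp.exp(-(x**2) / 2)
d1 = sp.lambdify(x, sp.diff(norm_pdf, x, 1))
d2 = sp.lambdify(x, sp.diff(norm_pdf, x, 2))
sign = lambda v: "positive" if v > 0 else "negative"  # tiny helper, only for printing
for pt in [-2, -0.5, 0.5, 2]:
    print(f"x={pt:+.1f}: f'(x) is {sign(d1(pt))}, f''(x) is {sign(d2(pt))}")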

Let’s see what happens when the (vanilla) Newton’s method gets to a point where the second derivative is 0.

[62]:
with warnings.catch_warnings():
    warnings.filterwarnings("error")
    _, _, _, _ = univariate_newtons_method(norm_pdf, symbols=x, initial=1, steps=10)
---------------------------------------------------------------------------
RuntimeWarning                            Traceback (most recent call last)
Cell In[62], line 3
      1 with warnings.catch_warnings():
      2     warnings.filterwarnings("error")
----> 3     _, _, _, _ = univariate_newtons_method(norm_pdf, symbols=x, initial=1, steps=10)

Cell In[58], line 12, in univariate_newtons_method(f, symbols, initial, steps)
     10 g[0] = df(p[0])
     11 h[0] = ddf(p[0])
---> 12 step_vector[0] = df(p[0]) / ddf(p[0])
     13 for i in range(1, steps):
     14     p[i] = p[i - 1] - step_vector[i - 1]

RuntimeWarning: divide by zero encountered in divide

Second-order partial derivatives#

For functions of more than one variable we have partial derivatives as well as second-order partial derivatives.

For example, let \(f(x, y) = 2x^2+3y^2 - xy\). We have 2 partial derivatives

\(\cfrac{\partial f}{\partial x} = 4x - y\)

\(\cfrac{\partial f}{\partial y} = 6y - x\)

and 4 second-order partial derivatives

\(\cfrac{\partial}{\partial x}\left(\cfrac{\partial f}{\partial x}\right) = \cfrac{\partial^2 f}{\partial x^2} = 4\)

\(\cfrac{\partial}{\partial y}\left(\cfrac{\partial f}{\partial x}\right) = \cfrac{\partial^2 f}{\partial x \partial y} = -1\)

\(\cfrac{\partial}{\partial y}\left(\cfrac{\partial f}{\partial y}\right) = \cfrac{\partial^2 f}{\partial y^2} = 6\)

\(\cfrac{\partial}{\partial x}\left(\cfrac{\partial f}{\partial y}\right) = \cfrac{\partial^2 f}{\partial y \partial x} = -1\)

The fact that \(\cfrac{\partial^2 f}{\partial x \partial y} = \cfrac{\partial^2 f}{\partial y \partial x}\) is not a coincidence but rather a consequence of the symmetry of second-order partial derivatives (Schwarz’s theorem).
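
We can let sympy confirm the symmetry for this example.

x, y = sp.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y
# differentiating by x then y gives the same result as differentiating by y then x
assert sp.diff(f, x, y) == sp.diff(f, y, x)
print(sp.diff(f, x, y))  # -1, as computed above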

Hessian matrix#

If we can summarize partial derivatives into a gradient vector \(\nabla_f = \langle \cfrac{\partial f}{\partial x}, \cfrac{\partial f}{\partial y} \rangle\), it seems almost obvious that we can summarize second-order partial derivatives into a matrix.

Such a matrix is called the Hessian.

\(H_{f(x, y)} = \begin{bmatrix} \cfrac{\partial^2 f}{\partial x^2}&&\cfrac{\partial^2 f}{\partial x \partial y}\\ \cfrac{\partial^2 f}{\partial y \partial x}&&\cfrac{\partial^2 f}{\partial y^2} \end{bmatrix}\)

🔑 The Hessian matrix is a symmetric matrix containing the second-order partial derivatives of a function

Continuing the previous example, the Hessian matrix of \(f(x, y) = 2x^2+3y^2 - xy\) (at any point over its domain) is

\(H = \begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\)

We can also show that the original function can be expressed as

\(q(x, y) = \cfrac{1}{2}\begin{bmatrix}x&&y\end{bmatrix}\begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\begin{bmatrix}x\\y\end{bmatrix}\)

\(= \cfrac{1}{2}\begin{bmatrix}x&&y\end{bmatrix}\begin{bmatrix}4x - y\\6y - x\end{bmatrix}\)

\(= \cfrac{1}{2}(x(4x - y)+y(6y - x))\)

\(= \cfrac{1}{2}(4x^2 - xy + 6y^2 - xy)\)

\(= 2x^2 + 3y^2 - xy\)
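
A quick sympy check that the quadratic form above does reproduce the original function (the _check-suffixed names are just for illustration).

x, y = sp.symbols("x, y")
H_check = sp.Matrix([[4, -1], [-1, 6]])
p_check = sp.Matrix([x, y])
q_check = sp.Rational(1, 2) * (p_check.T @ H_check @ p_check)[0]
assert sp.expand(q_check) == 2 * x**2 + 3 * y**2 - x * y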

It turns out that \(q(x, y) = x^THx\) creates a concave up function if \(H\) is a positive-definite matrix (\(x^THx>0\)) and a concave down function if H is a negative-definite matrix (\(x^THx<0\)).

How do we determine the positive definiteness of \(H\)?

With its eigenvalues.

Recall that an eigenvector is a vector \(x\) such that \(Hx = \lambda x\).

If we multiply both sides by \(x^T\) we get \(x^THx = \lambda x^Tx\).

An eigenvector is usually scaled so that its norm is 1. The scaling changes neither its status as an eigenvector nor the eigenvalue, because \(H(cx) = \lambda (cx)\) for any scaling constant \(c\).

So if \(\|x\| = x^Tx = 1\), we get that \(x^THx = \lambda\).

Let’s verify this, before proceeding any further.

[63]:
H = np.array([[4, -1], [-1, 6]])
eigvals, eigvecs = np.linalg.eig(H)
xTHx = np.dot(eigvecs[:, 0].T, H).dot(eigvecs[:, 0])
lamxTx = eigvals[0] * np.dot(eigvecs[:, 0].T, eigvecs[:, 0].T)

assert np.isclose(xTHx, lamxTx)
assert np.isclose(xTHx, eigvals[0])

Based on the eigenvalues of \(H\) we can conclude one of the following about the curvature of a function:

  • the function at a point is concave up if the eigenvalues of \(H\) are all strictly positive (positive-definite)

  • the function at a point is concave down if the eigenvalues of \(H\) are all strictly negative (negative-definite)

  • the point is a saddle point if \(H\) has both strictly positive and strictly negative eigenvalues (indefinite)

  • we can’t conclude anything if one of the eigenvalues is zero (positive semi-definite or negative semi-definite)
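
As a small illustrative sketch (the helper name and the two extra example matrices below are made up for this note), we can classify a few Hessians from their eigenvalues.

def classify_hessian(H):
    # eigvalsh assumes a symmetric matrix, which a Hessian is
    eigvals = np.linalg.eigvalsh(H)
    if np.all(eigvals > 0):
        return "positive-definite: concave up at this point"
    if np.all(eigvals < 0):
        return "negative-definite: concave down at this point"
    if np.any(eigvals > 0) and np.any(eigvals < 0):
        return "indefinite: saddle point"
    return "semi-definite: inconclusive"


print(classify_hessian(np.array([[4, -1], [-1, 6]])))  # the Hessian of our example
print(classify_hessian(np.array([[-4, 1], [1, -6]])))  # its negation
print(classify_hessian(np.array([[2, 0], [0, -3]])))   # mixed signs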

Newton-Raphson method with more than one variable#

Recall in the univariate case the update of the parameter was \(x_{new} = x - f'(x) / f''(x)\).

Let \(p = (x, y)\). In the bivariate case, the update of the parameter is \(p_{new} = p - H_f^{-1}(p) \cdot \nabla_f(p)\), where \(H_f^{-1}(p)\) is a 2x2 matrix and \(\nabla_f(p)\) is a 2x1 column vector, so \(p_{new}\) is also a 2x1 column vector.

\(\begin{array}{l} \textbf{Algorithm: } \text{Newton's Method for Multivariate Optimization} \\ \textbf{Input: } \text{function to minimize } \mathcal{f}, \text{initial guess } p_0, \text{number of iterations } K, \text{convergence } \epsilon \\ \textbf{Output: } p \\ 1 : p \gets p_0 \\ 2 : \text{for k = 1 to K do} \\ 3 : \quad p_{new} = p - H^{-1}(p)\nabla f(p) \\ 4 : \quad \text{if } \|H^{-1}(p)\nabla f(p)\| < \epsilon \text{ then} \\ 5 : \quad \quad \text{return } p \\ 6 : \quad \text {end if} \\ 7 : \quad p \gets p_{new} \\ 8 : \text{return } p \\ \end{array}\)

Continuing the previous example, the gradient and the Hessian are

\(\nabla = \begin{bmatrix}4x-y\\-x+6y\end{bmatrix}\)

\(H = \begin{bmatrix}4&&-1\\-1&&6\end{bmatrix}\)

Let’s practice inverting the Hessian using the row-echelon form method.

[64]:
x, y = sp.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y

H = sp.hessian(f, (x, y))
HI = H.row_join(sp.Matrix.eye(2))
HI
[64]:
$\displaystyle \left[\begin{matrix}4 & -1 & 1 & 0\\-1 & 6 & 0 & 1\end{matrix}\right]$
[65]:
HI = HI.elementary_row_op("n->n+km", row=1, k=1 / 4, row1=1, row2=0)
HI
[65]:
$\displaystyle \left[\begin{matrix}4 & -1 & 1 & 0\\0 & 5.75 & 0.25 & 1\end{matrix}\right]$
[66]:
HI = HI.elementary_row_op("n->kn", row=1, k=4 / 23)
HI
[66]:
$\displaystyle \left[\begin{matrix}4 & -1 & 1 & 0\\0 & 1.0 & 0.0434782608695652 & 0.173913043478261\end{matrix}\right]$
[67]:
HI = HI.elementary_row_op("n->n+km", row=0, k=1, row1=0, row2=1)
HI
[67]:
$\displaystyle \left[\begin{matrix}4 & 0 & 1.04347826086957 & 0.173913043478261\\0 & 1.0 & 0.0434782608695652 & 0.173913043478261\end{matrix}\right]$
[68]:
HI = HI.elementary_row_op("n->kn", row=0, k=1 / 4)
HI
[68]:
$\displaystyle \left[\begin{matrix}1.0 & 0 & 0.260869565217391 & 0.0434782608695652\\0 & 1.0 & 0.0434782608695652 & 0.173913043478261\end{matrix}\right]$

The inverse of the Hessian is

[69]:
HI[-2:, -2:]
[69]:
$\displaystyle \left[\begin{matrix}0.260869565217391 & 0.0434782608695652\\0.0434782608695652 & 0.173913043478261\end{matrix}\right]$

Let’s verify it.

[70]:
assert np.array_equal(
    np.array(H.inv(), dtype="float"), np.array(HI[-2:, -2:], dtype="float")
)

Let’s calculate \(H_f^{-1} \cdot \nabla_f\)

[71]:
H.inv() @ sp.Matrix([sp.diff(f, x), sp.diff(f, y)])
[71]:
$\displaystyle \left[\begin{matrix}x\\y\end{matrix}\right]$

So \(\begin{bmatrix}x_{new}\\y_{new}\end{bmatrix} = \begin{bmatrix}x\\y\end{bmatrix} - \begin{bmatrix}x\\y\end{bmatrix}\)

In this case, regardless of our starting point we get that the minimum is at [0, 0] which is consistent with

\(\begin{cases}4x-y=0\\-x+6y=0\end{cases}\)

\(\begin{cases}x=\cfrac{y}{4}\\-\cfrac{y}{4}+6y=0\end{cases}\)

\(\begin{cases}x=\cfrac{y}{4}\\\cfrac{23y}{4}=0\end{cases}\)

\(\begin{cases}x=0\\y=0\end{cases}\)

Let’s verify we get the same result with Newton’s method.

[72]:
def bivariate_newtons_method(f, symbols, initial, steps):
    x, y = symbols
    H = sp.hessian(f, (x, y))
    hessian_eval = sp.lambdify((x, y), H, "numpy")
    nabla = sp.Matrix([sp.diff(f, x), sp.diff(f, y)])
    nabla_eval = sp.lambdify((x, y), nabla, "numpy")
    H_dot_nabla = sp.lambdify((x, y), H.inv() @ nabla, "numpy")
    p = np.zeros((steps, 2))
    g = np.zeros((steps, 2))
    h = np.zeros((steps, 2, 2))
    step_vector = np.zeros((steps, 2))
    p[0] = initial
    g[0] = nabla_eval(*p[0]).squeeze()
    h[0] = hessian_eval(*p[0]).squeeze()
    step_vector[0] = H_dot_nabla(*p[0]).squeeze()
    for i in range(1, steps):
        p[i] = p[i - 1] - step_vector[i - 1]
        g[i] = nabla_eval(*p[i]).squeeze()
        h[i] = hessian_eval(*p[i]).squeeze()
        step_vector[i] = H_dot_nabla(*p[i]).squeeze()
        if np.linalg.norm(step_vector[i]) < 1e-4:
            break
    return p[: i + 1], g[: i + 1], h[: i + 1], step_vector[: i + 1]


x, y = sp.symbols("x, y")
f = 2 * x**2 + 3 * y**2 - x * y
p, _, h, _ = bivariate_newtons_method(
    f, symbols=(x, y), initial=np.array([1, 3]), steps=10
)

display(
    Math(
        "\\arg \\min \\limits_{x,y}"
        + sp.latex(f)
        + "="
        + sp.latex(np.array2string(p[-1], precision=2, suppress_small=True))
    )
)
print(f"Newton's method converged after {len(p)-1} steps")
print(
    f"Eigenvalues of H: {np.array2string(np.linalg.eigvals(h[-1]), precision=2, suppress_small=True)}"
)
$\displaystyle \arg \min \limits_{x,y}2 x^{2} - x y + 3 y^{2}=\mathtt{\text{[0. 0.]}}$
Newton's method converged after 1 steps
Eigenvalues of H: [3.59 6.41]

Since the function is quadratic Newton’s method only needed one step to find the critical point, which is a minimum because the Hessian at the critical point is positive-definite.

Let’s consider a non-quadratic function.

[73]:
x, y = sp.symbols("x, y")
g = x**4 + 0.8 * y**4 + 4 * x**2 + 2 * y**2 - x * y - 0.2 * x**2 * y
p, _, h, _ = bivariate_newtons_method(
    g, symbols=(x, y), initial=np.array([4, 4]), steps=10
)

display(
    Math(
        "\\arg \\min \\limits_{x,y}"
        + sp.latex(g)
        + "="
        + sp.latex(np.array2string(p[-1], precision=2, suppress_small=True))
    )
)
print(f"Newton's method converged after {len(p)-1} steps")
print(
    f"Eigenvalues of H: {np.array2string(np.linalg.eigvals(h[-1]), precision=2, suppress_small=True)}"
)
$\displaystyle \arg \min \limits_{x,y}x^{4} - 0.2 x^{2} y + 4 x^{2} - x y + 0.8 y^{4} + 2 y^{2}=\mathtt{\text{[-0. 0.]}}$
Newton's method converged after 7 steps
Eigenvalues of H: [8.24 3.76]

The critical point \([0, 0]\) is a minimum because the Hessian is positive-definite.

Lagrange multiplier method (extra)#

🔑 The method of Lagrange multipliers is a method for finding the local minima or local maxima of a function subject to equality constraints (and, in generalized form, inequality constraints)

Let’s say we want to find the maximum or minimum of \(f(x, y) = 5x - 3y\) subject to the constraint \(g(x, y) = 136\) where \(g(x, y) = x^2 + y^2\).

A constrained minimization problem \(\textbf{P}\) would be expressed as

\(\begin{array}{l} \textbf{P} \\ \min \limits_{x,y} 5x - 3y \\ \text{subject to} \quad x^2 + y^2 = 136 \end{array}\)

[74]:
x, y = sp.symbols("x, y")
f = 5 * x - 3 * y
g = x**2 + y**2

x_ = np.linspace(-15, 15, 250)
y_ = np.linspace(-15, 15, 250)
xx, yy = np.meshgrid(x_, y_)

fig, ax = plt.subplots()
cs1 = ax.contour(xx, yy, sp.lambdify((x, y), f, "numpy")(xx, yy), levels=20, cmap="RdBu_r")
cs2 = ax.contour(xx, yy, sp.lambdify((x, y), g, "numpy")(xx, yy), levels=[136], colors="k")
ax.clabel(cs1, inline=True, fontsize=10)
ax.clabel(cs2, inline=True, fontsize=10)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
ax.set_aspect("equal")
plt.title("Contour plot of $f(x, y)$ with constraint $g(x) = 136$", pad=20.)
plt.show()
../_images/notebooks_calculus_204_0.png

The plot above might not make a lot of sense at first glance. To hopefully gain a better intuition let’s consider a different visualization.

[75]:
f_values = sp.lambdify((x, y), f, "numpy")(xx, yy)
f_surface = go.Surface(
    z=f_values,
    x=xx,
    y=yy,
    colorscale="RdBu_r",
    contours=dict(z=dict(start=-105, end=105, size=15, show=True)),
    colorbar=dict(title="f(x, y)"),
    name="f(x, y)",
)

g_values = sp.lambdify((x, y), g, "numpy")(xx, yy)
mask = np.isclose(g_values, 136, atol=5.0)
z_masked = f_values.copy()
x_masked = xx.copy()
y_masked = yy.copy()
z_masked[~mask] = np.nan
x_masked[~mask] = np.nan
y_masked[~mask] = np.nan

g_constraint = go.Scatter3d(
    z=z_masked.flatten(),
    x=x_masked.flatten(),
    y=y_masked.flatten(),
    marker=dict(color="#000000", size=1),
    showlegend=False,
    name="g(x, y)",
)

fig = go.Figure([f_surface, g_constraint])
fig.update_layout(
    title="f(x, y) with projected constraint g(x) = 136 onto it",
    autosize=False,
    width=600,
    height=600,
    scene_aspectmode="cube",
    margin=dict(l=10, r=10, b=10, t=30),
    scene_camera=dict(
        eye=dict(x=-1.8, y=1.4, z=0.7),
    ),
)
fig.show()

\(f(x, y)\) is a linear function so its minimum and maximum are \(-\infty\) and \(+\infty\) respectively.

The constraint bounds its minimum and maximum to the values attained where \(x\) and \(y\) lie on the circle \(x^2 + y^2 = 136\).

If we hover the cursor over the boundary of the constraint, where the surface has the deepest blue possible, we’ll find the minimum of this constrained problem which is around -68.

The contour plot we saw earlier with just the contour of \(f(x, y) = -68\) gives an insight into how we can solve this problem analytically.

[76]:
_, ax = plt.subplots()
cs1 = ax.contour(xx, yy, sp.lambdify((x, y), f, "numpy")(xx, yy), levels=[-68], cmap="RdBu_r")
cs2 = ax.contour(xx, yy, sp.lambdify((x, y), g, "numpy")(xx, yy), levels=[136], colors="k")
ax.clabel(cs1, inline=True, fontsize=10)
ax.clabel(cs2, inline=True, fontsize=10)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
ax.set_aspect("equal")
plt.title("Contour plot of $f(x, y)=-68$ with constraint $g(x) = 136$", pad=20.)
plt.show()
../_images/notebooks_calculus_208_0.png

The contour \(f(x, y) = -68\) is tangent to the contour \(g(x, y) = 136\).

It turns out there is an interesting geometric interpretation of the gradient.

🔑 The gradient of a function evaluated at a point can be seen as a vector perpendicular to the contour line passing through that point.

Let’s verify this property, while also checking how the gradients of \(f(x, y)\) and \(g(x, y)\) evaluated at the minimum are relative to each other.

[77]:
vf = np.array(
    sp.Matrix([sp.diff(f, x), sp.diff(f, y)]).evalf(subs={x: -10, y: 6}),
    dtype="float",
).squeeze()
uvf = vf / np.linalg.norm(vf)
vg = np.array(
    sp.Matrix([sp.diff(g, x), sp.diff(g, y)]).evalf(subs={x: -10, y: 6}),
    dtype="float",
).squeeze()
uvg = vg / np.linalg.norm(vg)

print(
    "The dot product of the unit vectors represented by the normalized gradients of "
    + f"f(x, y) and g(x, y) evaluated at [-10, 6] is {np.dot(uvf, uvg):.2f}, that is "
    + f"their angle is {np.degrees(np.arccos(np.dot(uvf, uvg))):.0f} degree."
)

_, ax = plt.subplots()
cs1 = ax.contour(
    xx, yy, sp.lambdify((x, y), f, "numpy")(xx, yy), levels=[-68], cmap="RdBu_r"
)
cs2 = ax.contour(
    xx, yy, sp.lambdify((x, y), g, "numpy")(xx, yy), levels=[136], colors="k"
)
ax.clabel(cs1, inline=True, fontsize=10)
ax.clabel(cs2, inline=True, fontsize=10)
plt.quiver(
    [-10, -10],
    [6, 6],
    [uvf[0], uvg[0]],
    [uvf[1], uvg[1]],
    angles="xy",
    scale=0.3,
    scale_units="xy",
)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
ax.set_aspect("equal")
plt.title("The gradients of f(x, y)=68 and g(x, y)=136 are parallel", pad=20.0)
plt.show()
The dot product of the unit vectors represented by the normalized gradients of f(x, y) and g(x, y) evaluated at [-10, 6] is -1.00, that is their angle is 180 degree.
../_images/notebooks_calculus_210_1.png

The gradients of \(f(x, y)\) and \(g(x, y)\) are \(\nabla f(x,y) = \begin{bmatrix}5\\-3\end{bmatrix}\) and \(\nabla g(x,y) = \begin{bmatrix}2x\\2y\end{bmatrix}\).

At the point \([-10, 6]\) the two gradients are \(\nabla f(-10,6) = \begin{bmatrix}5\\-3\end{bmatrix}\) and \(\nabla g(-10,6) = \begin{bmatrix}-20\\12\end{bmatrix}\).

The two vectors are parallel, so we can multiply one by some constant to get the other.

\(\begin{bmatrix}5\\-3\end{bmatrix} = -\cfrac{1}{4} \begin{bmatrix}-20\\12\end{bmatrix}\)

\(\nabla f(-10,6) = -\cfrac{1}{4} \nabla g(-10,6)\)

In general, we can conclude that a contour line of \(f(x, y)\) is tangent to a contour line of \(g(x, y)\) at the point \([x_0, y_0]\) if

\(\nabla f(x_0, y_0) = \lambda \nabla g(x_0, y_0)\)

In our example, this equivalence generates the following system of equations.

\(\begin{cases} 5 = \lambda 2x\\ -3 = \lambda 2y \end{cases}\)

At this point one might be tempted to solve it for \(x\), \(y\) and \(\lambda\), but the system is underdetermined because there are more unknowns than equations in the system.

Moreover, the system only helps us find a contour of \(f(x, y)\) that is tangent to a contour of \(g(x, y)\).

However, we are not interested in just any contour. We want it to be tangent to the contour \(g(x, y) = 136\).

The system we need to solve is

\(\begin{cases} 5 = \lambda 2x\\ -3 = \lambda 2y\\ x^2 + y^2 = 136\\ \end{cases}\)

Let’s practice solving a system of equations manually.

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ x^2 + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25}{4\lambda^2} + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25}{4} (-\cfrac{3}{2y})^{-2} + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25}{4} 3^{-2} 4y^2 + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25}{4} \cfrac{4y^2}{3^2} + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25y^2}{3^2} + y^2 = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{25y^2+ 9y^2}{9} = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ \cfrac{34y^2}{9} = 136\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ 34y^2 = 1224\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ y^2 = 36\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ y = 6\\ y = -6\\ \end{cases}\)

Let’s solve for \(y = 6\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{2y} = \lambda\\ y = 6\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{\lambda 2} = x\\ -\cfrac{3}{12} = \lambda\\ y = 6\\ \end{cases}\)

\(\begin{cases} \cfrac{5}{-\frac{2}{4}} = x\\ -\cfrac{1}{4} = \lambda\\ y = 6\\ \end{cases}\)

\(\begin{cases} x = -10\\ \lambda = -\cfrac{1}{4}\\ y = 6\\ \end{cases}\)

For \(y = -6\) we swap signs

\(\begin{cases} x = 10\\ \lambda = \cfrac{1}{4}\\ y = -6\\ \end{cases}\)

Let’s verify it.

[78]:
x, y = sp.symbols("x, y")
f = 5 * x - 3 * y
g = x**2 + y**2
lam = sp.symbols("lambda")
solutions = sp.solve(
    [
        sp.diff(f, x) - lam * sp.diff(g, x),
        sp.diff(f, y) - lam * sp.diff(g, y),
        g - 136,
    ],
    [x, y, lam],
    dict=True,
)

for s in solutions:
    z = sp.lambdify((x, y), f, "numpy")(s[x], s[y])
    fig.add_scatter3d(
        x=[float(s[x])],
        y=[float(s[y])],
        z=[float(z)],
        showlegend=False,
        marker=dict(color="#67001F" if z > 0 else "#053061"),
        name="maximum" if z > 0 else "minimum",
    )
fig.update_layout(
    title="Maximum and minimum of the constrained problem",
)
fig.show()

We basically explained how the Lagrange multiplier method works, but at this point one might be wondering where the name comes from.

It turns out that the system can be expressed using the partial derivatives of the Lagrangian function

\(\mathcal{L}(x, y, \lambda) = f(x, y) - \lambda (g(x, y) - c)\), where \(c\) is the value of the constraint (136 in our example).

😉 Although it doesn’t matter from a numerical perspective the Lagrangian is conventionally formed with a minus for maximization problems and a plus for minimization problems. It does matter if \(\lambda\) has an interpretation, like in economics.

This system is shown below.

\(\begin{cases} \cfrac{\partial \mathcal{L}}{\partial x} = 0\\ \cfrac{\partial \mathcal{L}}{\partial y} = 0\\ \cfrac{\partial \mathcal{L}}{\partial \lambda} = 0 \end{cases}\)

Recall our problem is \(\min 5x - 3y \text{ s.t. } x^2 + y^2 = 136\).

The Lagrangian for this problem is \(\mathcal{L}(x, y, \lambda) = 5x - 3y - \lambda(x^2 + y^2 - 136)\)

\(\begin{cases} \cfrac{\partial}{\partial x}\left[5x - 3y - \lambda(x^2 + y^2 - 136)\right] = 0\\ \cfrac{\partial}{\partial y}\left[5x - 3y - \lambda(x^2 + y^2 - 136)\right] = 0\\ \cfrac{\partial}{\partial \lambda}\left[5x - 3y - \lambda(x^2 + y^2 - 136)\right] = 0 \end{cases}\)

\(\begin{cases} 5 - \lambda 2x = 0\\ -3 - \lambda 2y = 0\\ x^2 + y^2 - 136 = 0 \end{cases}\)

Which is exactly the system we solved earlier.

\(\begin{cases} 5 = \lambda 2x\\ -3 = \lambda 2y\\ x^2 + y^2 = 136 \end{cases}\)
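As a quick check, we can let sympy rebuild this system for us; a minimal sketch, reusing x, y, f, g and lam from the verification cell above:

[ ]:
lagrangian = f - lam * (g - 136)
for var in (x, y, lam):
    # each partial derivative set to 0 reproduces one equation of the system above
    display(Math(
        "\\cfrac{\\partial \\mathcal{L}}{\\partial " + sp.latex(var) + "} = "
        + sp.latex(sp.diff(lagrangian, var)) + " = 0"
    ))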

Convex optimization#

🔑 Convex optimization entails the minimization of a convex objective function over a convex feasible set. If the objective function and the constraints are both linear the problem can be solved by linear programming. If the objective function is quadratic and the constraints are linear the problem can be solved by quadratic programming.

Let \(\textbf{P}\) be a convex optimization problem

\(\begin{array}{l} \textbf{P} \\ \min \limits_{x} f(x) \\ \text{subject to} \quad g_i(x) \ge 0 \quad \forall i=1,\dots,m \end{array}\)

where:

  • \(f(x)\) is a convex function

  • \(g_i(x)\) are linear functions defining a convex set

🔑 A function is convex if every point on the line segment connecting any two points \(a\) and \(b\) on the function lies on or above the function

(figure: a convex function) Source: www.wikipedia.org

🔑 A set is convex if every point on the line segment connecting any two points \(a\) and \(b\) in the set lies completely within the set

(figure: convex and non-convex sets) Source: www.hacarus.com
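The convexity-of-a-function definition is easy to probe numerically. Here is a minimal sketch (the sampled functions and the tolerance are just illustrative choices) checking the chord inequality \(f(ta + (1-t)b) \le tf(a) + (1-t)f(b)\) on random points for a convex and a non-convex function:

[ ]:
rng = np.random.default_rng(0)
a_pts = rng.uniform(-5, 5, 1000)
b_pts = rng.uniform(-5, 5, 1000)
t = rng.uniform(0, 1, 1000)
for func, name in [(np.square, "x^2"), (np.sin, "sin(x)")]:
    # chord inequality: the segment between (a, f(a)) and (b, f(b)) lies on or above the graph
    lhs = func(t * a_pts + (1 - t) * b_pts)
    rhs = t * func(a_pts) + (1 - t) * func(b_pts)
    print(f"{name} passes the convexity check on this sample: {bool(np.all(lhs <= rhs + 1e-9))}")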

Let’s take a look at the convex set created by the linear inequality constraint \(x \ge y\).

[79]:
fig, ax = plt.subplots()
x_ = np.linspace(-30, 30)
y_ = x_
ax.plot(x_, y_, "tab:blue", lw=1)
ax.scatter(5, -5, color="tab:blue")
ax.text(3, -4.5, "5 - (-5) = 10", color="tab:blue")
ax.scatter(-5, 5, color="tab:orange")
ax.text(-6.5, 5.5, "-5 - 5 = -10", color="tab:orange")
ax.fill_between(x_, y_, -10, color="tab:blue", alpha=0.1)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
plt.title("The linear constraint $x - y \\geq 0$ defines a convex set", pad=20.0)
plt.show()
../_images/notebooks_calculus_215_0.png

The point \((-5, 5)\) doesn’t satisfy the constraint because \(-5 < 5\), or alternatively \(-10 < 0\).

The point \((5, -5)\) does satisfy the constraint because \(5 \ge -5\), and so does any point on or below the line \(y = x\).

Moreover, we can visually verify that the half space below the line is convex because every point on the line segment connecting any two points \(a\) and \(b\) in the set lies completely within the set.

🔑 Any linear constraint defines a convex set, and a set of simultaneous linear constraints defines the intersection of convex sets, which is itself a convex set
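A quick way to convince ourselves, before the animation below, is to intersect two clipped half-planes with shapely and check that the result equals its own convex hull; the bounding box is just a hypothetical choice to keep the polygons finite.

[ ]:
# two half-planes clipped to the box [-10, 10] x [-10, 10]
lower_half = Polygon([(-10, -10), (10, -10), (10, 0), (-10, 0)])  # y <= 0
left_half = Polygon([(-10, -10), (0, -10), (0, 10), (-10, 10)])   # x <= 0
intersection = lower_half.intersection(left_half)
# a polygon is convex exactly when it equals its convex hull
print(intersection.equals(intersection.convex_hull))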

Let’s take a look at the convex set resulting from the following \(g_i(x, y)\) inequality constraints

\(y - 1 \ge 0\)

\(x + 9 - y \ge 0\)

\(-\cfrac{3}{2}x + 9 - y \ge 0\)

\(\cfrac{1}{2}x + 6 - y \ge 0\)

\(-\cfrac{6}{5}x + 6 - y \ge 0\)

[80]:
def _update(frame):
    global convex_set, half_space, in_set, out_set
    half_space.remove()
    convex_set.remove()
    in_set.remove()
    out_set.remove()
    ax.plot(x_, Y[:, frame], "tab:blue", lw=1)
    line = LineString(list(zip(x_, Y[:, frame])))
    poly = Polygon([line.coords[0], line.coords[-1], vertices[frame]])
    (half_space,) = ax.fill(*poly.exterior.xy, color="tab:blue", alpha=0.1)
    if frame > 0:
        poly_inter = poly.intersection(polygons[-1])
    else:
        poly_inter = poly
    polygons.append(poly_inter)
    (convex_set,) = ax.fill(*poly_inter.exterior.xy, color="tab:blue", alpha=0.5)
    in_set = ax.scatter(0, 1, color="tab:blue", marker="o", label="In set")
    ax.spines[["left", "bottom"]].set_position(("data", 0))
    out_set = ax.scatter(
        5 / 2,
        5,
        color="tab:blue" if frame < 4 else "tab:orange",
        marker="d",
        label="In set" if frame < 4 else "Not in set",
        zorder=2,
    )
    ax.legend(loc="upper right")


fig, ax = plt.subplots()
x_ = np.linspace(-30, 30, 250)
Y = np.zeros((250, 5))
Y[:, 0] = np.full_like(x_, 1)
m = np.array([1, -1.5, 0.5, -1.2])
b = np.array([9, 9, 6, 6])
Y[:, 1:] = m * x_.reshape(-1, 1) + b
vertices = [(-30, 30), (30, -30), (-30, -30), (30, -30), (-30, -30)]
polygons = []
(half_space,) = ax.fill([], [])
(convex_set,) = ax.fill([], [])
in_set = ax.scatter([], [])
out_set = ax.scatter([], [])
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-10, 10)
ax.set_ylim(-5, 10)
plt.title(
    "Multiple linear constraints define the intersection of convex sets", pad=20.0
)

ani = FuncAnimation(fig, _update, frames=Y.shape[1], repeat=False, interval=1000)
html_animation = ani.to_jshtml(default_mode="loop")
if "runtime" not in get_ipython().config.IPKernelApp.connection_file:
    html_animation = fixup_animation_js(html_animation)
display(HTML(html_animation))
plt.close()

Let’s see if \((0, 1)\) belongs to the feasible set

\(0 \ge 0\)

\(8 \ge 0\)

\(8 \ge 0\)

\(5 \ge 0\)

\(5\ge 0\)

All constraints are satisfied.

What about \((\cfrac{5}{2}, 5)\)?

\(4 \ge 0\)

\(\cfrac{13}{2} \ge 0\)

\(\cfrac{1}{4} \ge 0\)

\(\cfrac{9}{4} \ge 0\)

\(-2 < 0\)

The last constraint is not satisfied.
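The same checks can be written as a small numpy sketch (the helper g_all below is just for illustration):

[ ]:
def g_all(x, y):
    # the five constraints g_i(x, y) >= 0, in the same order as above
    return np.array([
        y - 1,
        x + 9 - y,
        -3 / 2 * x + 9 - y,
        1 / 2 * x + 6 - y,
        -6 / 5 * x + 6 - y,
    ])


for point in [(0, 1), (5 / 2, 5)]:
    values = g_all(*point)
    print(point, values, "feasible" if np.all(values >= 0) else "not feasible")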

Inequality constraints and the Karush–Kuhn–Tucker conditions#

Let \(\textbf{P}\) be a problem with \(m\) inequality constraints

\(\begin{array}{l} \textbf{P} \\ \min \limits_{x} f(x) \\ \text{subject to} \quad g_i(x) \ge 0 \quad \forall i=1,\dots,m \end{array}\)

It turns out inequality constraints can be reformulated as equality constraints by introducing a non-negative slack variable \(s_i\) for each of them.

\(\begin{array}{l} \min \limits_{x} f(x) \\ \text{subject to} \quad g_i(x) - s_i = 0 \end{array}\)

  • If \(g_i(x)>0\): The slack variable \(s_i\) takes the value \(g_i(x)\), so the difference is zero.

  • If \(g_i(x)=0\): The slack variable \(s_i\) is zero.

  • If \(g_i(x)<0\): No non-negative value of \(s_i\) can make the difference zero, which makes the equality unachievable, so it honours the original constraint (the small sketch after this list illustrates all three cases).
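Here is a tiny numeric sketch of the three cases, using a hypothetical constraint \(g(x) = x - 2\):

[ ]:
def g_example(x):
    # hypothetical constraint g(x) = x - 2 >= 0
    return x - 2


for x_val in [5, 2, 1]:  # g > 0, g = 0, g < 0
    s = max(g_example(x_val), 0)  # the best non-negative slack available
    print(f"g({x_val}) = {g_example(x_val)}, s = {s}, g - s = {g_example(x_val) - s}")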

The Lagrangian of this problem is

\(\mathcal{L}(x, \lambda, s) = f(x) - \sum \limits_{i=1}^{m} \lambda_i(g_i(x) - s_i)\)

This problem has an optimal solution only if the complementary slackness condition is met.

🔑 The complementary slackness condition states that at the optimal solution, either the Lagrange multipliers \(\lambda_i\) are zero or the respective constraints are active (the equality constraint \(g_i(x^{\star}) = 0\) holds).

The complementary slackness condition is expressed as \(\lambda_i g_i(x^{\star}) = 0\) \(\forall i=1,\dots,m\).

While it was useful to see the slack variables, in practice we do not add them; rather we restrict the Lagrange multipliers of the inequality constraints to be non-negative.

\(\begin{array}{l} \textbf{P} \\ \min \limits_{x} f(x) \\ \text{subject to} \quad g_i(x) \ge 0 \quad \forall i=1,\dots,m \end{array}\)

The Lagrangian of \(\textbf{P}\) is formed exactly as if the \(g_i(x)\) were equality constraints.

\(\mathcal{L}(x, \lambda) = f(x) - \sum \limits_{i=1}^{m} \lambda_i g_i(x)\)

Note that we haven’t explicitly added any constraints for the Lagrange multipliers to be non-negative.

It’s just an additional condition for the problem to have an optimal solution, called dual feasibility.

🔑 The dual feasibility condition states that at the optimal solution, the Lagrange multipliers \(\lambda_i\) must be non-negative.

The dual feasibility condition is expressed as \(\lambda_i \ge 0\) \(\forall i=1,\dots,m\).

The complementary slackness and dual feasibility conditions are two of the four conditions known as Karush–Kuhn–Tucker (KKT) conditions.

The other two, stationarity and primal feasibility, simply state that at the optimal solution the gradient of the Lagrangian must be zero and all the constraints must be satisfied.

As an example, let \(\textbf{Q}\) be a convex optimization problem with a quadratic objective function and three linear inequality constraints defining a bounded convex set.

\(\begin{array}{l} \textbf{Q} \\ \min \limits_{x, y} x^2+y^2 \\ \text{subject to} \quad y + 4 \ge 0 \\ \quad \quad \quad \quad \quad x - y -\cfrac{3}{2} \ge 0 \\ \quad \quad \quad \quad \quad - \cfrac{3}{2}x - y + 9 \ge 0 \end{array}\)

Let’s build our intuition of this problem by looking at the contour plot.

[81]:
x, y = sp.symbols("x, y")
f = x**2 + y**2

x_ = np.linspace(-30, 30, 250)
y_ = np.linspace(-30, 30, 250)
xx, yy = np.meshgrid(x_, y_)

fig, ax = plt.subplots()
cs = ax.contour(
    xx,
    yy,
    sp.lambdify((x, y), f, "numpy")(xx, yy),
    levels=np.floor(np.linspace(1, 50, 11)),
    cmap="RdBu_r",
)
polygons = []
Y = np.zeros((250, 3))
Y[:, 0] = np.full_like(x_, -4)
m = np.array([1, -1.5])
b = np.array([-1.5, 9])
Y[:, 1:] = m * x_.reshape(-1, 1) + b
vertices = [(-30, 30), (30, -30), (-30, -30)]
for i in range(Y.shape[1]):
    line = LineString(list(zip(x_, Y[:, i])))
    poly = Polygon([line.coords[0], line.coords[-1], vertices[i]])
    if i > 0:
        poly_inter = poly.intersection(polygons[-1])
    else:
        poly_inter = poly
    polygons.append(poly_inter)
ax.fill(
    *poly_inter.exterior.xy,
    color="tab:blue",
    alpha=0.5,
    edgecolor="k",
    linewidth=2,
    zorder=2
)

ax.clabel(cs, inline=True, fontsize=10)
ax.spines[["left", "bottom"]].set_position(("data", 0))
ax.text(1.02, 0, "$x$", transform=ax.get_yaxis_transform())
ax.text(0, 1.02, "$y$", transform=ax.get_xaxis_transform())
ax.set_xlim(-10, 10)
ax.set_ylim(-10, 10)
ax.set_aspect("equal")
plt.title("The problem $\\text{Q}$", pad=20.0);
../_images/notebooks_calculus_220_0.png

We can visually see that the minimum should be around \((0.75, -0.75)\), where one of the sides of the triangle is tangent to a contour of \(f(x, y)\) close to the \(f(x, y) = 1\) level.
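We can back this guess up with a quick calculation: assuming the active constraint is the line \(x - y - \cfrac{3}{2} = 0\), the point of that line closest to the origin is the orthogonal projection of the origin onto it

\((x, y) = \cfrac{3/2}{1^2 + (-1)^2}(1, -1) = \left(\cfrac{3}{4}, -\cfrac{3}{4}\right)\)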

[82]:
ax.scatter(
    0.75,
    -0.75,
    marker="*",
    color="tab:orange",
    s=100,
    edgecolors="k",
    linewidth=0.5,
    label="optimal solution",
    zorder=2,
)
ax.legend()
fig
[82]:
../_images/notebooks_calculus_222_0.png

Let’s verify that \((0.75, -0.75)\) is indeed the optimal solution using the KKT conditions.

Before we do that we need to find the optimal values for \(\lambda_1\), \(\lambda_2\) and \(\lambda_3\).

Since \((0.75, -0.75)\) lies on the boundary of the second constraint, the second constraint is active. In fact,

\(\cfrac{3}{4} - (-\cfrac{3}{4}) -\cfrac{3}{2} = 0\)

The optimal point does not lie on the boundary of the other two constraints, so they are inactive. Thus, as per the complementary slackness condition, their respective Lagrange multipliers must be 0.

So for now we have that \(\lambda_1 = 0\) and \(\lambda_3 = 0\).

Let’s find \(\lambda_2\) by setting the partial derivatives of the Lagrangian to 0 and plugging in the optimal solution as well as \(\lambda_1 = 0\) and \(\lambda_3 = 0\).

The Lagrangian of \(\textbf{Q}\) is

\(\mathcal{L}(x, y, \lambda) = x^2 + y^2 - \lambda_{1} (y + 4) - \lambda_{2} (x - y - \cfrac{3}{2}) - \lambda_{3} (-\cfrac{3}{2} x - y + 9)\)

The partial derivatives wrt \(x\) and \(y\) are

\(\cfrac{\partial\mathcal{L}}{\partial x} = 2x - \lambda_{2} + \cfrac{3}{2}\lambda_{3}\)

\(\cfrac{\partial\mathcal{L}}{\partial y} = 2y - \lambda_{1} + \lambda_{2} + \lambda_{3}\)

At the optimal solution and for \(\lambda_1 = 0\) and \(\lambda_3 = 0\)

\(\cfrac{\partial\mathcal{L}}{\partial x} = 0\)

\(2\cfrac{3}{4} - \lambda_{2} = 0\)

\(\lambda_{2} = \cfrac{3}{2}\)

\(\cfrac{\partial\mathcal{L}}{\partial y} = 0\)

\(-2\cfrac{3}{4} + \lambda_{2} = 0\)

\(\lambda_{2} = \cfrac{3}{2}\)

We can now verify that we’ve satisfied all the KKT conditions for this problem, which are shown below.

  1. Stationarity

\(\nabla \mathcal{L}(x^{\star},y^{\star}) = 0\)

  2. Primal feasibility

\(g_1(x^{\star},y^{\star}) \ge 0\)

\(g_2(x^{\star},y^{\star}) \ge 0\)

\(g_3(x^{\star},y^{\star}) \ge 0\)

  3. Dual feasibility

\(\lambda_1^{\star} \ge 0\)

\(\lambda_2^{\star} \ge 0\)

\(\lambda_3^{\star} \ge 0\)

  4. Complementary slackness

\(\lambda_1^{\star}\,g_1(x^{\star},y^{\star}) = 0\)

\(\lambda_2^{\star}\,g_2(x^{\star},y^{\star}) = 0\)

\(\lambda_3^{\star}\,g_3(x^{\star},y^{\star}) = 0\)

Let’s verify them all.

[83]:
x, y = sp.symbols("x, y")
lam1, lam2, lam3 = sp.symbols("lambda_1, lambda_2, lambda_3", nonnegative=True)
f = x**2 + y**2
g1 = y + 4
g2 = x - y - sp.Rational(3, 2)
g3 = -sp.Rational(3, 2) * x - y + 9
lagrangian = f - lam1 * (g1) - lam2 * (g2) - lam3 * (g3)
dLx = sp.diff(lagrangian, x)
dLy = sp.diff(lagrangian, y)

sol = {x: 0.75, y: -0.75, lam1: 0, lam2: 1.5, lam3: 0}
# stationarity
assert dLx.subs(sol) == 0
assert dLy.subs(sol) == 0
# primal feasibility
assert g1.subs(sol) >= 0
assert g2.subs(sol) >= 0
assert g3.subs(sol) >= 0
# dual feasibility
assert sol[lam1] >= 0
assert sol[lam2] >= 0
assert sol[lam3] >= 0
# complementary slackness
assert sol[lam1] * g1.subs(sol) == 0
assert sol[lam2] * g2.subs(sol) == 0
assert sol[lam3] * g3.subs(sol) == 0

Let’s solve this problem analytically.

[84]:
solutions = sp.solve(
    [dLx, dLy, lam1 * g1, lam2 * g2, lam3 * g3], [x, y, lam1, lam2, lam3], dict=True
)
valid_solutions = []
for sol in solutions:
    if g1.subs(sol) >= 0 and g2.subs(sol) >= 0 and g3.subs(sol) >= 0:
        valid_solutions.append(sol)

for sol in valid_solutions:
    for k, v in sol.items():
        display(Math(sp.latex(k) + "=" + sp.latex(v)))
$\displaystyle \lambda_{1}=0$
$\displaystyle \lambda_{2}=\frac{3}{2}$
$\displaystyle \lambda_{3}=0$
$\displaystyle x=\frac{3}{4}$
$\displaystyle y=- \frac{3}{4}$

Wolfe dual problem#

🔑 The duality principle states that optimization problems may be viewed from either of two perspectives, the primal problem or the dual problem. If the primal is a minimization problem then the dual is a maximization problem (and vice versa).

As before, let \(\textbf{P}\) be a convex optimization problem with \(m\) inequality constraints

\(\begin{array}{l} \textbf{P} \\ \min \limits_{x} f(x) \\ \text{subject to} \quad g_i(x) \ge 0 \quad \forall i=1,\dots,m \end{array}\)

This is the primal problem and the Lagrangian of \(\textbf{P}\) is the usual

\(\mathcal{L}(x, \lambda) = f(x) - \sum \limits_{i=1}^{m} \lambda_i g_i(x)\)

For a convex problem \(\textbf{P}\), the corresponding dual problem is termed the Wolfe dual.

\(\begin{array}{l} \textbf{WD}_P \\ \max \limits_{\lambda} h(\lambda)\\ \text{subject to} \quad \lambda_i \ge 0 \quad \forall i=1,\dots,m \end{array}\)

where \(h(\lambda) = \min \limits_{x} \mathcal{L}(x, \lambda)\).

Recall the KKT conditions and note how the dual feasibility condition \(\lambda \ge 0\) refers to the satisfaction of the constraints of the dual problem, similarly to how the primal feasibility condition \(g(x) \ge 0\) refers to the constraints of the primal problem.

\(h(\lambda)\) is the Lagrangian expressed as a function of \(\lambda\) alone, evaluated at the minimizing \(x^{\star}\).

It’s obtained by setting the partial derivative of the Lagrangian wrt \(x\) to 0, solving it for \(x^{\star}\) as a function of \(\lambda\), and substituting that expression back into the Lagrangian so that \(x\) is eliminated.

This is best explained with an example.

Recall, the problem \(\textbf{Q}\)

\(\begin{array}{l} \textbf{Q} \\ \min \limits_{x, y} x^2+y^2 \\ \text{subject to} \quad y + 4 \ge 0 \\ \quad \quad \quad \quad \quad x - y -\cfrac{3}{2} \ge 0 \\ \quad \quad \quad \quad \quad - \cfrac{3}{2}x - y + 9 \ge 0 \end{array}\)

whose Lagrangian is

\(\mathcal{L}(x, y, \lambda) = x^2 + y^2 - \lambda_{1} (y + 4) - \lambda_{2} (x - y - \cfrac{3}{2}) - \lambda_{3} (-\cfrac{3}{2} x - y + 9)\)

To obtain \(h(\lambda) = \min \limits_{x} \mathcal{L}(x, \lambda)\) we set the partial derivatives of the Lagrangian to 0, solve for \(x^{\star}\) and \(y^{\star}\) and substitute their respective linear functions of \(\lambda\) back into the Lagrangian.

\(\cfrac{\partial\mathcal{L}}{\partial x} = 0\)

\(2x^{\star} - \lambda_{2} + \cfrac{3}{2}\lambda_{3} = 0\)

\(x^{\star} = \cfrac{2\lambda_{2}-3\lambda_{3}}{4}\)

\(\cfrac{\partial\mathcal{L}}{\partial y} = 0\)

\(2y^{\star} - \lambda_{1} + \lambda_{2} + \lambda_{3} = 0\)

\(y^{\star} = \cfrac{\lambda_{1} - \lambda_{2} - \lambda_{3}}{2}\)

\(h(\lambda) = \cfrac{(2 \lambda_{2} - 3 \lambda_{3})^{2}}{16} + \cfrac{(\lambda_{1} - \lambda_{2} - \lambda_{3})^{2}}{4} - \lambda_{1} \left(\cfrac{\lambda_{1} - \lambda_{2} - \lambda_{3}}{2} + 4\right) - \lambda_{2} \left(\cfrac{2 \lambda_{2} - 3 \lambda_{3}}{4} - \cfrac{\lambda_{1} - \lambda_{2} - \lambda_{3}}{2} - \cfrac{3}{2}\right) - \lambda_{3} \left(- \cfrac{3 (2 \lambda_{2} - 3 \lambda_{3})}{8} - \cfrac{\lambda_{1} - \lambda_{2} - \lambda_{3}}{2} + 9\right)\)
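Before formalizing the dual, here is a quick sympy sketch double-checking the hand-derived \(h(\lambda)\): the substituted Lagrangian should match the expression above exactly.

[ ]:
x, y = sp.symbols("x, y")
lam1, lam2, lam3 = sp.symbols("lambda_1, lambda_2, lambda_3", nonnegative=True)
lagrangian_q = (
    x**2 + y**2
    - lam1 * (y + 4)
    - lam2 * (x - y - sp.Rational(3, 2))
    - lam3 * (-sp.Rational(3, 2) * x - y + 9)
)
x_star = sp.solve(sp.diff(lagrangian_q, x), x)[0]  # (2*lambda_2 - 3*lambda_3)/4
y_star = sp.solve(sp.diff(lagrangian_q, y), y)[0]  # (lambda_1 - lambda_2 - lambda_3)/2
h_check = lagrangian_q.subs({x: x_star, y: y_star})
h_written = (
    (2 * lam2 - 3 * lam3) ** 2 / 16
    + (lam1 - lam2 - lam3) ** 2 / 4
    - lam1 * ((lam1 - lam2 - lam3) / 2 + 4)
    - lam2 * ((2 * lam2 - 3 * lam3) / 4 - (lam1 - lam2 - lam3) / 2 - sp.Rational(3, 2))
    - lam3 * (-3 * (2 * lam2 - 3 * lam3) / 8 - (lam1 - lam2 - lam3) / 2 + 9)
)
assert sp.expand(h_check - h_written) == 0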

The Wolfe dual problem can equivalently be formulated as

\(\begin{array}{l} \textbf{WD}_P \\ \max \limits_{x, \lambda} \mathcal{L}(x, \lambda)\\ \text{subject to} \quad \lambda_i \ge 0 \quad \forall i=1,\dots,m \\ \quad \quad \quad \quad \quad \cfrac{\partial}{\partial x}\mathcal{L}(x, \lambda) = 0 \end{array}\)

To analytically solve a minimization problem using the Lagrangian dual we can follow these steps:

  1. Solve the Lagrangian dual for all \(\lambda_i\). We might need to solve it for different combinations of active constraints.

  2. Evaluate the expressions for \(x^{\star}\) and \(y^{\star}\) used to create \(h(\lambda)\)

  3. Check the KKT conditions

[85]:
x, y = sp.symbols("x, y")
lam1, lam2, lam3 = sp.symbols("lambda_1, lambda_2, lambda_3", nonnegative=True)
f = x**2 + y**2
g1 = y + 4
g2 = x - y - sp.Rational(3, 2)
g3 = -sp.Rational(3, 2) * x - y + 9
lagrangian = f - lam1 * (g1) - lam2 * (g2) - lam3 * (g3)
h = {
    x: sp.solve(sp.diff(lagrangian, x), x)[0],
    y: sp.solve(sp.diff(lagrangian, y), y)[0],
}
lagrangian_dual = lagrangian.subs(h)
eval_set = [
    dict(ChainMap(*t))
    for t in [
        c
        for i in range(1, 4)
        for c in combinations([{lam1: 0}, {lam2: 0}, {lam3: 0}], i)
    ]
]
valid_solutions = []
for e in eval_set:
    lambda_sol = sp.solve(
        [
            sp.diff(lagrangian_dual.subs(e), lam1),
            sp.diff(lagrangian_dual.subs(e), lam2),
            sp.diff(lagrangian_dual.subs(e), lam3),
        ],
        [lam1, lam2, lam3],
        dict=True,
    )
    if lambda_sol:
        for s in lambda_sol:
            x_star = h[x].subs(e | s)
            y_star = h[y].subs(e | s)
            sol = dict(ChainMap(e, s, {x: x_star, y: y_star}))
            assert (
                g1.subs(sol) >= 0 and g2.subs(sol) >= 0 and g3.subs(sol) >= 0
            ), "Primal feasibility not satisfied"
            assert (
                sol[lam1] * g1.subs(sol) == 0
                and sol[lam2] * g2.subs(sol) == 0
                and sol[lam3] * g3.subs(sol) == 0
            ), "Complementary slackness not satisfied"
            valid_solutions.append(sol)
for sol in valid_solutions:
    for k, v in sol.items():
        display(Math(sp.latex(k) + "=" + sp.latex(v)))
$\displaystyle x=\frac{3}{4}$
$\displaystyle y=- \frac{3}{4}$
$\displaystyle \lambda_{2}=\frac{3}{2}$
$\displaystyle \lambda_{3}=0$
$\displaystyle \lambda_{1}=0$

This might look like an unnecessarily convoluted way to get to the same result we got by solving the primal problem directly, but the Lagrangian dual has become a common approach to solving optimization problems.

Many problems can be efficiently solved by constructing the Lagrangian function of the problem and solving the dual problem instead of the primal problem.

Optimizing SVM#

Recall in Linear Algebra we applied concepts of linear algebra to SVM.

We derived the margin \(\gamma(w, b) = \min\cfrac{|wx_i+b|}{\|w\|}\) from the formula for the distance between a point and a line \(d = \cfrac{|ax_0 + by_0 + c|}{\sqrt{a^2 + b^2}}\).

We then translated the objective in plain English, “find the decision boundary that maximizes the margin between two classes such that the points belonging to each class lie on the correct side of the boundary”, into the following constrained problem

\(\begin{array}{l} \min \limits_{w, b} \cfrac{1}{2} w \cdot w \\ \text{subject to } y_i(w \cdot x_i + b) \ge 1 \text{ } \forall i = 1, \dots, m \end{array}\)

We used scipy.minimize to solve it but we now have all the tools (Lagrange multipliers, KKT conditions, Wolfe dual) to solve it ourselves.

The Wolfe dual is

\(\begin{array}{l} \max \limits_{\alpha} h(\alpha)\\ \text{subject to} \quad \alpha_i \ge 0 \quad \forall i=1,\dots,m \end{array}\)

where \(h(\alpha) = \min \limits_{w, b} \mathcal{L}(w,b,\alpha)\).

Note we use \(\alpha\) instead of \(\lambda\) because it’s the notation commonly used in the SVM literature.

Let’s create the Lagrangian dual.

Step 1: Form the Lagrangian function of the primal

\(\mathcal{L}(w, b, \alpha) = \cfrac{1}{2} w \cdot w - \sum \limits_{i=1}^{m} \alpha_i[y_i(w \cdot x_i + b) - 1]\)

Step 2a: Calculate the partial derivative of \(\mathcal{L}(w, b, \alpha)\) wrt \(w\), set it to 0, solve for \(w^{\star}\) and substitute it into \(\mathcal{L}(w, b, \alpha)\) to obtain \(h(b, \alpha)\)

\(\cfrac{\partial}{\partial w}\mathcal{L} = 0\)

\(w^{\star} - \sum \limits_{i=1}^{m} \alpha_iy_ix_i = 0\)

\(w^{\star} = \left(\sum \limits_{i=1}^{m} \alpha_iy_ix_i\right)\)

\(h(b, \alpha) = \cfrac{1}{2} \left(\sum \limits_{i=1}^{m} \alpha_iy_ix_i\right) \cdot \left(\sum \limits_{j=1}^{m} \alpha_jy_jx_j\right) - \sum \limits_{i=1}^{m} \alpha_i[y_i(\left(\sum \limits_{j=1}^{m} \alpha_jy_jx_j\right) \cdot x_i + b) - 1]\)

\(= \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j - \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j - b \sum \limits_{i=1}^{m} \alpha_iy_i + \sum \limits_{i=1}^{m} \alpha_i\)

\(= - \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j - b \sum \limits_{i=1}^{m} \alpha_iy_i + \sum \limits_{i=1}^{m} \alpha_i\)

Step 2b: Calculate the partial derivative of \(\mathcal{L}(w, b, \alpha)\) wrt \(b\), set it to 0, solve for \(b^{\star}\) and substitute it into \(\mathcal{L}(w, b, \alpha)\) to obtain \(h(\alpha)\)

\(\cfrac{\partial}{\partial b}\mathcal{L} = 0\)

\(-\sum \limits_{i=1}^{m} \alpha_iy_i = 0\)

To impose that the partial derivative wrt \(b\) is 0, we’ll add a constraint to the dual problem.

\(\begin{array}{l} \max \limits_{\alpha} h(\alpha) \\ \text{subject to} \quad \alpha_i \ge 0 \quad \forall i=1,\dots,m \\ \quad \quad \quad \quad \quad \sum \limits_{i=1}^{m} \alpha_iy_i = 0 \end{array}\)

So we get that \(h(\alpha)\) is

\(h(\alpha) = - \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j - b\underbrace{\sum \limits_{i=1}^{m} \alpha_iy_i}_{0} + \sum \limits_{i=1}^{m} \alpha_i\)

\(= - \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j + \sum \limits_{i=1}^{m} \alpha_i\)

And here’s the dual we need to solve.

\(\begin{array}{l} \max \limits_{\alpha} \sum \limits_{i=1}^{m} \alpha_i - \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j \\ \text{subject to} \quad \alpha_i \ge 0 \quad \forall i=1,\dots,m \\ \quad \quad \quad \quad \quad \sum \limits_{i=1}^{m} \alpha_iy_i = 0 \end{array}\)

Once we’ve solved the Lagrangian dual for the multipliers \(\alpha\)’s, we get \(w^{\star}\) from

\(w^{\star} = \sum \limits_{i=1}^{m} \alpha_iy_ix_i\)

To get \(b^{\star}\), though, the process is a little bit more complicated.

We can use the constraint \(y_i(w \cdot x_i + b) - 1 \ge 0 \quad \forall i=1,\dots,m\).

Recall that by the complementary slackness condition the constraint is equal to 0 when it is active, which is exactly when \(\alpha_i\) may be greater than 0.

\(y_i(w \cdot x_i^{SV} + b) - 1 = 0 \quad \forall i=1,\dots,s\)

Note we put a \(SV\) superscript on the \(x_i\)’s for which the constraint is active, and \(s\) is the number of active constraints.

The \(x_i^{SV}\) are the support vectors, from which the algorithm takes its name.

We replace \(w\) with \(w^{\star}\) and we solve for \(b^{\star}\).

\(y_i(w^{\star} \cdot x_i^{SV} + b^{\star}) - 1 = 0 \quad \forall i=1,\dots,s\)

We multiply both sides of each equation by \(y_i\).

\(y_i^2(w^{\star} \cdot x_i^{SV} + b^{\star}) - y_i = 0\)

Because \(y_i \in \{-1, 1\}\) then \(y_i^2 = 1\)

\(w^{\star} \cdot x_i^{SV} + b^{\star} - y_i = 0\)

\(b^{\star} = y_i - w^{\star} \cdot x_i^{SV}\)

Any single support vector would do, but for numerical stability we average over all \(s\) of them

\(b^{\star} = \cfrac{1}{s} \sum \limits_{i=1}^{s} (y_i - w^{\star} \cdot x_i^{SV})\)

Let’s try to solve a small problem analytically.

[86]:
m = 6
k = 2
neg_centroid = [-1, -1]
pos_centroid = [1, 1]

rng = np.random.default_rng(1)
X = np.r_[
    rng.standard_normal((m // 2, k)) + neg_centroid,
    rng.standard_normal((m // 2, k)) + pos_centroid,
].T
Y = np.array([[-1.] * (m // 2) + [1.] * (m // 2)])

fig, ax = plt.subplots()
ax.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, "tab:orange", "tab:blue"))
for i in range(m):
    ax.text(*X[:, i]+0.2, i+1, color="tab:orange" if Y[0, i] == -1 else "tab:blue")
ax.set_aspect("equal")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
plt.title("Classification data")
plt.show()
../_images/notebooks_calculus_232_0.png

It seems that the support vectors are 1 and 4.

This helps us solve the Lagrangian dual analytically because we only need to solve it for \(\alpha_1\) and \(\alpha_4\). We’ll set the others to 0.

[87]:
i = sp.symbols("i", cls=sp.Idx)
a = sp.IndexedBase("a", nonnegative=True)
x1 = sp.IndexedBase("x1")
x2 = sp.IndexedBase("x2")
y = sp.IndexedBase("y")
w1, w2, b = sp.symbols("w1, w2, b")
w = sp.Matrix([w1, w2])
x = sp.Matrix([x1[i], x2[i]])
f = (0.5 * w.T * w)[0]
g = y[i] * ((w.T * x)[0] + b) - 1
lagrangian = f - sp.Sum(a[i] * g, (i, 1, m))
dLw1 = sp.diff(lagrangian, w)[0]
dLw2 = sp.diff(lagrangian, w)[1]
dLb = sp.diff(lagrangian, b)
w1_star = sp.solve(dLw1, w1)[0]
w2_star = sp.solve(dLw2, w2)[0]
lagrangian_dual = (
    lagrangian.subs({w1: w1_star, w2: w2_star})
    .expand()
    .subs(sp.Sum(b * a[i] * y[i], (i, 1, m)), 0)
    .simplify()
)

display(Math("\\mathcal{h}(\\alpha) =" + sp.latex(lagrangian_dual)))
$\displaystyle \mathcal{h}(\alpha) =- 0.5 \left({a}_{1} {x_{1}}_{1} {y}_{1} + {a}_{2} {x_{1}}_{2} {y}_{2} + {a}_{3} {x_{1}}_{3} {y}_{3} + {a}_{4} {x_{1}}_{4} {y}_{4} + {a}_{5} {x_{1}}_{5} {y}_{5} + {a}_{6} {x_{1}}_{6} {y}_{6}\right)^{2} - 0.5 \left({a}_{1} {x_{2}}_{1} {y}_{1} + {a}_{2} {x_{2}}_{2} {y}_{2} + {a}_{3} {x_{2}}_{3} {y}_{3} + {a}_{4} {x_{2}}_{4} {y}_{4} + {a}_{5} {x_{2}}_{5} {y}_{5} + {a}_{6} {x_{2}}_{6} {y}_{6}\right)^{2} + {a}_{1} + {a}_{2} + {a}_{3} + {a}_{4} + {a}_{5} + {a}_{6}$

It turns out the system of equations resulting from the partial derivatives of \(h(\alpha)\) is too complicated to solve symbolically.

sympy conveniently features a numerical solver, sympy.nsolve, even though it’s not specifically designed for this kind of problem.

An alternative would be scipy.optimize.fsolve which can solve a system of non-linear equations.
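As a minimal illustration of the fsolve interface, here is a sketch on the earlier circle-constraint system (rather than on the SVM dual itself):

[ ]:
from scipy.optimize import fsolve


def residuals(v):
    # 5 = 2*lam*x, -3 = 2*lam*y, x^2 + y^2 = 136 from the Lagrange multipliers section
    x_v, y_v, lam_v = v
    return [5 - 2 * lam_v * x_v, -3 - 2 * lam_v * y_v, x_v**2 + y_v**2 - 136]


root = fsolve(residuals, x0=[10.0, -5.0, 0.5])  # initial guess near one stationary point
print(root, np.allclose(residuals(root), 0))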

We’ll stick with sympy for convenience, but later we’ll take a look at cvxopt, a low-level package specifically designed to solve quadratic programming problems.

sympy.nsolve requires an initial guess, and we’ll also pass verify=False to prevent above-tolerance errors.

We’ll use the KKT conditions to verify the solution ourselves.

[88]:
data = dict(
    ChainMap(
        *[{x1[j + 1]: X[0, j], x2[j + 1]: X[1, j], y[j + 1]: Y[0, j]} for j in range(m)]
    )
)
a_fix = {a[2]: 0.0, a[3]: 0.0, a[5]: 0.0, a[6]: 0.0}

for a_0 in np.linspace(0.1, 1, 50):
    a_sol = sp.nsolve(
        [sp.diff(lagrangian_dual.subs(data | a_fix), a[j + 1]) for j in range(m)]
        + [sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_fix)],
        [a[1], a[4]],
        [a_0, a_0],
        dict=True,
        verify=False,
    )
    a_sol = a_fix | a_sol[0]
    if sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_sol) == 0:
        break

a_sol = {
    k: v
    for k, v in sorted(zip(a_sol.keys(), a_sol.values()), key=lambda a: a[0].indices)
}
for k, v in a_sol.items():
    display(Math(sp.latex(k) + "=" + str(v)))
$\displaystyle {a}_{1}=0.412244897959184$
$\displaystyle {a}_{2}=0.0$
$\displaystyle {a}_{3}=0.0$
$\displaystyle {a}_{4}=0.412244897959184$
$\displaystyle {a}_{5}=0.0$
$\displaystyle {a}_{6}=0.0$

Let’s verify the stationarity condition for the dual.

[89]:
for j in range(m):
    assert sp.diff(lagrangian_dual.subs(data | a_sol), a[j + 1]) == 0
assert sp.Sum(a[i] * y[i], (i, 1, m)).doit().subs(data | a_sol) == 0

Let’s calculate \(w^{\star}\) and \(b^{\star}\).

Recall

\(w^{\star} = \sum \limits_{i=1}^{m} \alpha_iy_ix_i\)

\(b^{\star} = \cfrac{1}{s} \sum \limits_{i=1}^{s} (y_i - w^{\star} \cdot x_i^{SV})\)

[90]:
alpha = np.array(list(a_sol.values()), dtype="float")
sv = np.argwhere(alpha > 1e-5).squeeze()
w_star = (alpha * Y) @ X.T
b_star = np.mean(Y[:, sv] - w_star @ X[:, sv])

display(Math("w^{\\star} =" + sp.latex(sp.Matrix(w_star))))
display(Math("b^{\\star} =" + str(b_star)))
$\displaystyle w^{\star} =\left[\begin{matrix}0.460668244204358 & 0.725344881755978\end{matrix}\right]$
$\displaystyle b^{\star} =-0.4646549582754853$

It turns out we could have calculated \(w^{\star}\) using just the support vectors, because the other \(\alpha_i\) are 0.

In practice the \(\alpha_i\) are never exactly 0, but in this case we can verify that it would have produced the exact same result.

[91]:
assert np.array_equal(w_star, (alpha[sv] * Y[:, sv]) @ X[:, sv].T)

Let’s verify the KKT conditions for the primal.

[92]:
wb_sol = {w1: w_star[0][0], w2: w_star[0][1], b: b_star}
sol = data | a_sol | wb_sol
# stationarity
assert dLw1.doit().subs(sol) == 0
assert dLw2.doit().subs(sol) == 0
assert dLb.doit().subs(sol) == 0
# primal feasibility
for j in range(m):
    try:
        g_eval = g.subs({i: j + 1}).subs(sol)
        assert g_eval >= 0, f"constraint {j+1} not satisfied: {g_eval:.2f}"
    except AssertionError as e:
        print(e)
# dual feasibility
for j in range(m):
    assert a_sol[a[j + 1]] >= 0
# complementary slackness
for j in range(m):
    try:
        ag_eval = a_sol[a[j + 1]] * g.subs({i: j + 1}).subs(sol)
        assert (
            ag_eval == 0
        ), f"complementary slackness {j+1} not satisfied: {ag_eval:.2f}"
    except AssertionError as e:
        print(e)
constraint 1 not satisfied: -0.10
constraint 3 not satisfied: -0.09
constraint 4 not satisfied: -0.10
complementary slackness 1 not satisfied: -0.04
complementary slackness 4 not satisfied: -0.04

We’re not at the optimal solution, but we’re close.

Let’s plot it.

[93]:
plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, "tab:orange", "tab:blue"))
x1 = np.linspace(-4, 4, 100)
plt.plot(x1, (-w_star[0][0] * x1 - b_star) / w_star[0][1], color="k", alpha=0.5)
plt.scatter(
    X[0, sv],
    X[1, sv],
    s=80,
    facecolors="none",
    edgecolors="y",
    color="y",
    label="support vectors"
)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Solution of the dual using sympy.nsolve")
plt.gca().set_aspect("equal")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.legend()
plt.show()
../_images/notebooks_calculus_246_0.png

As mentioned earlier, sympy is not designed to solve this kind of problem, although in this case we managed to get an almost optimal solution.

We’ll now see how to solve this quadratic programming problem with the cvxopt library.

The cvxopt.solvers.qp function can solve the following primal quadratic programming problem

\(\begin{array}{l} \min \limits_{x} \cfrac{1}{2}x^TPx + q^Tx \\ \text{subject to} \quad Gx \preceq h \\ \quad \quad \quad \quad \quad Ax = b \end{array}\)

From the docstring of cvxopt.solvers.qp we see that it takes these parameters:

  • \(P\): \((m \times m)\) d (double) cvxopt.matrix

  • \(q\): \((m \times 1)\) d (double) cvxopt.matrix

  • \(G\): \((m \times m)\) d (double) cvxopt.matrix

  • \(h\): \((m \times 1)\) d (double) cvxopt.matrix

  • \(A\): \((1 \times m)\) d (double) cvxopt.matrix

  • \(b\): \((1 \times 1)\) d (double) cvxopt.matrix

The inequality constraints \(Gx \preceq h\) are

\(\begin{bmatrix} G_1 & 0 & \dots & 0 \\ 0 & G_2 & \dots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \dots & G_m \end{bmatrix} \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_m \\ \end{bmatrix} \preceq \begin{bmatrix} h_1 \\ h_2 \\ \vdots \\ h_m \\ \end{bmatrix} \Longrightarrow \begin{bmatrix} G_1x_1 \\ G_2x_2 \\ \vdots \\ G_mx_m \\ \end{bmatrix} \preceq \begin{bmatrix} h_1 \\ h_2 \\ \vdots \\ h_m \\ \end{bmatrix}\)

Note \(\preceq\) represents element-wise inequalities between two column vectors, in this case \(Gx\) which is \((m \times 1)\) and \(h\) which is also \((m \times 1)\).

The equality constraints \(Ax = b\) are

\(\begin{bmatrix} A_1 & A_2 \dots A_m \end{bmatrix} \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_m \\ \end{bmatrix}= \begin{bmatrix} b \end{bmatrix} \Longrightarrow A_1x_1 + A_2x_2 + \dots + A_mx_m= b\)
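Before mapping our dual onto this interface, here is a minimal toy call, a hypothetical one-variable problem \(\min x^2\) subject to \(x \ge 1\), just to exercise the argument layout:

[ ]:
# min x^2 s.t. x >= 1 means P = [2], q = [0], G = [-1], h = [-1] (no equality constraints)
P_toy = cvxopt.matrix([[2.0]])
q_toy = cvxopt.matrix([0.0])
G_toy = cvxopt.matrix([[-1.0]])
h_toy = cvxopt.matrix([-1.0])
toy = cvxopt.solvers.qp(P_toy, q_toy, G_toy, h_toy, options=dict(show_progress=False))
print(toy["x"][0])  # approximately 1.0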

We need to re-write our problem so it conforms with cvxopt.solvers.qp requirements.

Recall the dual is

\(\begin{array}{l} \max \limits_{\alpha} \sum \limits_{i=1}^{m} \alpha_i - \cfrac{1}{2} \sum \limits_{i=1}^{m} \sum \limits_{j=1}^{m} \alpha_i\alpha_jy_iy_jx_i \cdot x_j \\ \text{subject to} \quad \alpha_i \ge 0 \quad \forall i=1,\dots,m \\ \quad \quad \quad \quad \quad \sum \limits_{i=1}^{m} \alpha_iy_i = 0 \end{array}\)

Rewriting it as a minimization in matrix notation, it becomes

\(\begin{array}{l} \min \limits_{\alpha} \cfrac{1}{2} \alpha^T (yy^Tx_i \cdot x_j) \alpha - \mathbf{1}^T\alpha \\ \text{subject to} \quad \alpha \succeq 0 \\ \quad \quad \quad \quad \quad y \alpha = 0 \end{array}\)

Then we note that:

  • \(x\) is \(\alpha\) (what cvxopt.solvers.qp solves for) and it’s a column vector

  • \(P\) corresponds to \((yy^Tx_i \cdot x_j)\)

  • \(q\) corresponds to a column vector of -1’s to make our minus a plus (\(q\) gets transposed)

  • \(G\) is a diagonal matrix of -1’s to make our \(\succeq\) a \(\preceq\)

  • \(h\) corresponds to a column vector of 0’s

  • \(A\) corresponds to \(y\) which is a row vector

  • \(b\) corresponds to 0

Let \(P = yy^T \circ K\), the element-wise product of \(yy^T\) and the Gram matrix \(K\), where \(K_{ij} = x_i \cdot x_j\).

Let \(yy^T\) be the square matrix resulting from the outer product of \(y\)

\(yy^T = \begin{bmatrix} y_1y_1&&\dots&&y_my_1\\ \vdots&&\ddots&&\vdots\\ y_my_1&&\dots&&y_my_m \end{bmatrix}\)

\(K = \begin{bmatrix} x_1 \cdot x_1&&\dots&&x_m \cdot x_1\\ \vdots&&\ddots&&\vdots\\ x_m \cdot x_1&&\dots&&x_m \cdot x_m \end{bmatrix}\)
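As a sanity check, \(K\) is just the Gram matrix of the data, so it can also be obtained in vectorized form (reusing X and m from the classification data cell above); the cell below keeps the explicit double loop for clarity.

[ ]:
K_vectorized = X.T @ X  # entry (i, j) is x_i . x_j
assert np.allclose(
    K_vectorized,
    [[np.dot(X[:, i], X[:, j]) for j in range(m)] for i in range(m)],
)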

[94]:
K = np.array([np.dot(X[:, i], X[:, j]) for j in range(m) for i in range(m)]).reshape(
    (m, m)
)
P_mat = cvxopt.matrix(np.outer(Y, Y.T) * K)
assert P_mat.size == (m, m)
assert P_mat.typecode == "d"

Q_mat = cvxopt.matrix(np.full(m, -1.0))
assert Q_mat.size == (m, 1)
assert Q_mat.typecode == "d"

G_mat = cvxopt.matrix(np.diag(np.full(m, -1.0)))
assert G_mat.size == (m, m)
assert G_mat.typecode == "d"

h_mat = cvxopt.matrix(np.zeros(m))
assert h_mat.size == (m, 1)
assert h_mat.typecode == "d"

A_mat = cvxopt.matrix(Y, (1, m))
assert A_mat.size == (1, m)
assert A_mat.typecode == "d"

b_mat = cvxopt.matrix(0.0)
assert b_mat.size == (1, 1)
assert b_mat.typecode == "d"

solution = cvxopt.solvers.qp(P_mat, Q_mat, G_mat, h_mat, A_mat, b_mat, options=dict(show_progress=False))
alpha = np.ravel(solution["x"])
sv = np.argwhere(alpha > 1e-5).squeeze()
print(f"The support vectors are {sv+1}")
The support vectors are [1 4]

It seems our earlier assumption about the support vectors was indeed correct.

[95]:
w_star = (alpha * Y) @ X.T
b_star = np.mean(Y[:, sv] - w_star @ X[:, sv])

display(Math("w^{\\star} =" + sp.latex(sp.Matrix(w_star))))
display(Math("b^{\\star} =" + str(b_star)))
$\displaystyle w^{\star} =\left[\begin{matrix}0.514414543271143 & 0.809980717346117\end{matrix}\right]$
$\displaystyle b^{\star} =-0.5188731465771707$

Let’s verify the KKT conditions.

[96]:
a_sol = {a[j+1]: a_val for j, a_val in enumerate(alpha)}
wb_sol = {w1: w_star[0][0], w2: w_star[0][1], b: b_star}
sol = data | a_sol | wb_sol
# stationarity
assert np.isclose(float(dLw1.doit().subs(sol)), 0, atol=1e-6)
assert np.isclose(float(dLw2.doit().subs(sol)), 0, atol=1e-6)
assert np.isclose(float(dLb.doit().subs(sol)), 0, atol=1e-6)
# primal feasibility
for j in range(m):
    try:
        g_eval = g.subs({i: j + 1}).subs(sol)
        assert g_eval >= 0, f"constraint {j} not satisfied: {g_eval:.2f}"
    except AssertionError as e:
        print(e)
# dual feasibility
for j in range(m):
    assert a_sol[a[j + 1]] >= 0
# complementary slackness
for j in range(m):
    try:
        ag_eval = float(a_sol[a[j + 1]] * g.subs({i: j + 1}).subs(sol))
        assert np.isclose(ag_eval, 0, atol=1e-6), f"complementary slackness {j} not satisfied: {ag_eval:.2f}"
    except AssertionError as e:
        print(e)

This time they are all satisfied.

Let’s plot it again.

[97]:
plt.scatter(X[0], X[1], c=np.where(Y.squeeze() == -1, "tab:orange", "tab:blue"))
x1 = np.linspace(-4, 4, 100)
plt.plot(x1, (-w_star[0][0] * x1 - b_star) / w_star[0][1], color="k", alpha=0.5)
plt.scatter(
    X[0, sv],
    X[1, sv],
    s=80,
    facecolors="none",
    edgecolors="y",
    color="y",
    label="support vectors"
)
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.title("Solution of the dual using cvxopt.qp")
plt.gca().set_aspect("equal")
plt.xlim(-5, 5)
plt.ylim(-5, 5)
plt.legend()
plt.show()
../_images/notebooks_calculus_254_0.png